From cf58b897872e515ca0784ca15a6ed3d047c17e6d Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Mon, 7 Jun 2021 08:14:52 +0000 Subject: [PATCH 01/80] working on row and column conversions --- cpp/CMakeLists.txt | 1 + cpp/benchmarks/CMakeLists.txt | 4 + .../row_conversion/row_conversion.cpp | 116 ++ cpp/include/cudf/row_conversion.hpp | 51 + cpp/src/row_conversion/row_conversion.cu | 1106 +++++++++++++++++ 5 files changed, 1278 insertions(+) create mode 100644 cpp/benchmarks/row_conversion/row_conversion.cpp create mode 100644 cpp/include/cudf/row_conversion.hpp create mode 100644 cpp/src/row_conversion/row_conversion.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 82bc5bfba93..785ac1f72de 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -350,6 +350,7 @@ add_library(cudf src/rolling/rolling.cu src/rolling/rolling_collect_list.cu src/round/round.cu + src/row_conversion/row_conversion.cu src/scalar/scalar.cpp src/scalar/scalar_factories.cpp src/search/search.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index b3b92003573..7d353c37df7 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -250,3 +250,7 @@ ConfigureBench(JSON_BENCH # - io benchmark --------------------------------------------------------------------- ConfigureBench(MULTIBYTE_SPLIT_BENCHMARK io/text/multibyte_split_benchmark.cpp) + +################################################################################################### +# - row conversion benchmark --------------------------------------------------------- +ConfigureBench(ROW_CONVERSION_BENCH row_conversion/row_conversion.cpp) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp new file mode 100644 index 00000000000..c4edee91b3c --- /dev/null +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include +#include "cudf_test/column_utilities.hpp" + +class RowConversion : public cudf::benchmark { +}; + +static void BM_to_row(benchmark::State& state) +{ + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + auto const table = create_random_table({cudf::type_id::INT8, + cudf::type_id::INT32, + cudf::type_id::INT16, + cudf::type_id::INT64, + cudf::type_id::INT32, + cudf::type_id::BOOL8, + cudf::type_id::UINT16, + cudf::type_id::UINT8, + cudf::type_id::UINT64}, + 50, + row_count{n_rows}); + + cudf::size_type total_bytes = 0; + for (int i = 0; i < table->num_columns(); ++i) { + auto t = table->get_column(i).type(); + total_bytes += cudf::size_of(t); + } + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + +// auto rows = cudf::convert_to_rows(table->view()); + auto new_rows = cudf::convert_to_rows2(table->view()); + } + + state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); +} + +static void BM_from_row(benchmark::State& state) +{ + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + auto const table = create_random_table({cudf::type_id::INT8, + cudf::type_id::INT32, + cudf::type_id::INT16, + cudf::type_id::INT64, + cudf::type_id::INT32, + cudf::type_id::BOOL8, + cudf::type_id::UINT16, + cudf::type_id::UINT8, + cudf::type_id::UINT64}, + 256, + row_count{n_rows}); + /* auto const table = create_random_table({cudf::type_id::INT32}, + 4, + row_count{n_rows});*/ + + std::vector schema; + cudf::size_type total_bytes = 0; + for (int i = 0; i < table->num_columns(); ++i) { + auto t = table->get_column(i).type(); + schema.push_back(t); + total_bytes += cudf::size_of(t); + } + + auto rows = cudf::convert_to_rows(table->view()); + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + + auto out = cudf::convert_from_rows(rows, schema); + } + + state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); +} + +#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { BM_to_row(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 16, 1 << 24}}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +TO_ROW_CONVERSION_BENCHMARK_DEFINE(to_row_conversion) + +#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { BM_from_row(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 22}}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) diff --git a/cpp/include/cudf/row_conversion.hpp b/cpp/include/cudf/row_conversion.hpp new file mode 100644 index 00000000000..f5e2225ad19 --- /dev/null +++ b/cpp/include/cudf/row_conversion.hpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include + +namespace cudf { + +std::vector> convert_to_rows( + cudf::table_view const &tbl, + // TODO need something for validity + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + +std::vector> convert_to_rows2( + cudf::table_view const &tbl, + // TODO need something for validity + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr convert_from_rows( + cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr convert_from_rows( + std::vector> const &input, + std::vector const &schema, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + +} // namespace cudf diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu new file mode 100644 index 00000000000..fb5dc4cb38d --- /dev/null +++ b/cpp/src/row_conversion/row_conversion.cu @@ -0,0 +1,1106 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "cudf/types.hpp" +#include "rmm/device_buffer.hpp" +#include "thrust/iterator/counting_iterator.h" +#include "thrust/iterator/transform_iterator.h" + +namespace cudf { + +namespace detail { + +static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size_t alignment) +{ + return (offset + alignment - 1) & ~(alignment - 1); +} + + +/** + * Copy a simple vector to device memory asynchronously. Be sure to read + * the data on the same stream as is used to copy it. 
+ */ +template +std::unique_ptr> copy_to_dev_async(const std::vector &input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + std::unique_ptr> ret(new rmm::device_uvector(input.size(), stream, mr)); + CUDA_TRY(cudaMemcpyAsync( + ret->data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); + return ret; +} + +template +rmm::device_uvector copy_to_dev_async2( + const std::vector &input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + rmm::device_uvector ret(input.size(), stream, mr); + CUDA_TRY(cudaMemcpyAsync( + ret.data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); + return ret; +} + +__global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, + const cudf::size_type num_columns, + const cudf::size_type row_size, + const cudf::size_type *input_offset_in_row, + const cudf::size_type *num_bytes, + int8_t **output_data, + cudf::bitmask_type **output_nm, + const int8_t *input_data) +{ + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // For simplicity we will refer to this as a row_group + + // In practice we have found writing more than 4 columns of data per thread + // results in performance loss. As such we are using a 2 dimensional + // kernel in terms of threads, but not in terms of blocks. Columns are + // controlled by the y dimension (there is no y dimension in blocks). Rows + // are controlled by the x dimension (there are multiple blocks in the x + // dimension). + + cudf::size_type rows_per_group = blockDim.x; + cudf::size_type row_group_start = blockIdx.x; + cudf::size_type row_group_stride = gridDim.x; + cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + + extern __shared__ int8_t shared_data[]; + + // Because we are copying fixed width only data and we stride the rows + // this thread will always start copying from shared data in the same place + int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t *row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + + for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; + row_group_index += row_group_stride) { + // Step 1: Copy the data into shared memory + // We know row_size is always aligned with and a multiple of int64_t; + int64_t *long_shared = reinterpret_cast(shared_data); + const int64_t *long_input = reinterpret_cast(input_data); + + cudf::size_type shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); + cudf::size_type shared_output_stride = blockDim.x * blockDim.y; + cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); + if (row_index_end > num_rows) { row_index_end = num_rows; } + cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); + cudf::size_type shared_length = row_size * num_rows_in_group; + + cudf::size_type shared_output_end = shared_length / sizeof(int64_t); + + cudf::size_type start_input_index = + (row_size * row_group_index * rows_per_group) / sizeof(int64_t); + + for (cudf::size_type shared_index = shared_output_index; shared_index < shared_output_end; + shared_index += shared_output_stride) { + long_shared[shared_index] = long_input[start_input_index + 
shared_index]; + } + // Wait for all of the data to be in shared memory + __syncthreads(); + + // Step 2 copy the data back out + + // Within the row group there should be 1 thread for each row. This is a + // requirement for launching the kernel + cudf::size_type row_index = (row_group_index * rows_per_group) + threadIdx.x; + // But we might not use all of the threads if the number of rows does not go + // evenly into the thread count. We don't want those threads to exit yet + // because we may need them to copy data in for the next row group. + uint32_t active_mask = __ballot_sync(0xffffffff, row_index < num_rows); + if (row_index < num_rows) { + cudf::size_type col_index_start = threadIdx.y; + cudf::size_type col_index_stride = blockDim.y; + for (cudf::size_type col_index = col_index_start; col_index < num_columns; + col_index += col_index_stride) { + cudf::size_type col_size = num_bytes[col_index]; + const int8_t *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); + int8_t *col_output = output_data[col_index]; + switch (col_size) { + case 1: { + col_output[row_index] = *col_tmp; + break; + } + case 2: { + int16_t *short_col_output = reinterpret_cast(col_output); + short_col_output[row_index] = *reinterpret_cast(col_tmp); + break; + } + case 4: { + int32_t *int_col_output = reinterpret_cast(col_output); + int_col_output[row_index] = *reinterpret_cast(col_tmp); + break; + } + case 8: { + int64_t *long_col_output = reinterpret_cast(col_output); + long_col_output[row_index] = *reinterpret_cast(col_tmp); + break; + } + default: { + cudf::size_type output_offset = col_size * row_index; + // TODO this should just not be supported for fixed width columns, but just in case... + for (cudf::size_type b = 0; b < col_size; b++) { + col_output[b + output_offset] = col_tmp[b]; + } + break; + } + } + + cudf::bitmask_type *nm = output_nm[col_index]; + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; + int predicate = *valid_byte & (1 << byte_bit_offset); + uint32_t bitmask = __ballot_sync(active_mask, predicate); + if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } + } // end column loop + } // end row copy + // wait for the row_group to be totally copied before starting on the next row group + __syncthreads(); + } +} + +__global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, + const cudf::size_type num_rows, + const cudf::size_type num_columns, + const cudf::size_type row_size, + const cudf::size_type *output_offset_in_row, + const cudf::size_type *num_bytes, + const int8_t **input_data, + const cudf::bitmask_type **input_nm, + int8_t *output_data) +{ + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // We do not support copying a subset of the columns in a row yet, so we don't + // currently support a row that is wider than shared memory. + // For simplicity we will refer to this as a row_group + + // In practice we have found reading more than 4 columns of data per thread + // results in performance loss. As such we are using a 2 dimensional + // kernel in terms of threads, but not in terms of blocks. Columns are + // controlled by the y dimension (there is no y dimension in blocks). Rows + // are controlled by the x dimension (there are multiple blocks in the x + // dimension). 
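+  // As a purely hypothetical illustration (these numbers are not taken from the
+  // actual launch configuration): a launch with blockDim = (128, 4) gives
+  // rows_per_group = 128 rows per row group below, and each thread walks the
+  // columns of its row with a stride of blockDim.y = 4, i.e. columns
+  // threadIdx.y, threadIdx.y + 4, threadIdx.y + 8, ...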
+ + cudf::size_type rows_per_group = blockDim.x; + cudf::size_type row_group_start = blockIdx.x; + cudf::size_type row_group_stride = gridDim.x; + cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + + extern __shared__ int8_t shared_data[]; + + // Because we are copying fixed width only data and we stride the rows + // this thread will always start copying to shared data in the same place + int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t *row_vld_tmp = + &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + + for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; + row_group_index += row_group_stride) { + // Within the row group there should be 1 thread for each row. This is a + // requirement for launching the kernel + cudf::size_type row_index = start_row + (row_group_index * rows_per_group) + threadIdx.x; + // But we might not use all of the threads if the number of rows does not go + // evenly into the thread count. We don't want those threads to exit yet + // because we may need them to copy data back out. + if (row_index < (start_row + num_rows)) { + cudf::size_type col_index_start = threadIdx.y; + cudf::size_type col_index_stride = blockDim.y; + for (cudf::size_type col_index = col_index_start; col_index < num_columns; + col_index += col_index_stride) { + cudf::size_type col_size = num_bytes[col_index]; + int8_t *col_tmp = &(row_tmp[output_offset_in_row[col_index]]); + const int8_t *col_input = input_data[col_index]; + switch (col_size) { + case 1: { + *col_tmp = col_input[row_index]; + break; + } + case 2: { + const int16_t *short_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = short_col_input[row_index]; + break; + } + case 4: { + const int32_t *int_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = int_col_input[row_index]; + break; + } + case 8: { + const int64_t *long_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = long_col_input[row_index]; + break; + } + default: { + cudf::size_type input_offset = col_size * row_index; + // TODO this should just not be supported for fixed width columns, but just in case... 
+ for (cudf::size_type b = 0; b < col_size; b++) { + col_tmp[b] = col_input[b + input_offset]; + } + break; + } + } + // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned + // so we have to rewrite the addresses to make sure that it is 4 byte aligned + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; + uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; + int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); + cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); + // Now copy validity for the column + if (input_nm[col_index]) { + if (bit_is_set(input_nm[col_index], row_index)) { + atomicOr_block(valid_int, 1 << int_bit_offset); + } else { + atomicAnd_block(valid_int, ~(1 << int_bit_offset)); + } + } else { + // It is valid so just set the bit + atomicOr_block(valid_int, 1 << int_bit_offset); + } + } // end column loop + } // end row copy + // wait for the row_group to be totally copied into shared memory + __syncthreads(); + + // Step 2: Copy the data back out + // We know row_size is always aligned with and a multiple of int64_t; + int64_t *long_shared = reinterpret_cast(shared_data); + int64_t *long_output = reinterpret_cast(output_data); + + cudf::size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); + cudf::size_type shared_input_stride = blockDim.x * blockDim.y; + cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); + if (row_index_end > num_rows) { row_index_end = num_rows; } + cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); + cudf::size_type shared_length = row_size * num_rows_in_group; + + cudf::size_type shared_input_end = shared_length / sizeof(int64_t); + + cudf::size_type start_output_index = + (row_size * row_group_index * rows_per_group) / sizeof(int64_t); + + for (cudf::size_type shared_index = shared_input_index; shared_index < shared_input_end; + shared_index += shared_input_stride) { + long_output[start_output_index + shared_index] = long_shared[shared_index]; + } + __syncthreads(); + // Go for the next round + } +} + +struct block_info { + int start_col; + int start_row; + int end_col; + int end_row; + int buffer_num; +}; + +/** + * @brief copy data from cudf columns into x format, which is row-based + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param input_data pointer to raw table data + * @param input_nm pointer to validity data + * @param col_sizes array of sizes for each element in a column - one per column + * @param col_offsets offset into input data row for each column's start + * @param block_infos information about the blocks of work + * @param row_offsets offset to a specific row in the input data + * @param output_data pointer to output data + * + */ +__global__ void copy_from_columns(const cudf::size_type num_rows, + const cudf::size_type num_columns, + const int8_t **input_data, + const cudf::bitmask_type **input_nm, + const cudf::size_type *col_sizes, + const cudf::size_type *col_offsets, + const block_info *block_infos, + const uint64_t *row_offsets, + int8_t **output_data) +{ + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. 
+ // This has been broken up for us in the block_info struct, so we don't have + // any calculation to do here, but it is important to note. + + auto block = block_infos[blockIdx.x]; + extern __shared__ int8_t shared_data[]; + uint64_t const output_start_offset = col_offsets[block.start_col] + row_offsets[block.start_row]; + uint8_t const dest_shim_offset = reinterpret_cast(&output_data[0][output_start_offset]) & 7; // offset for alignment shim in order to match shared memory with final dest + + printf("copying from column %d to column %d with rows %d to row %d(grid dim %d, blockIdx %d)\n", block.start_col, block.end_col, block.start_row, block.end_row, gridDim.x, blockIdx.x); + + // each thread is responsible for every threadcount rows of data. + // the data is copies into shared memory in the final layout. + auto const shmem_row_size = align_offset(col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col] + dest_shim_offset, 8); // 8 byte alignment required for shared memory rows + auto const validity_offset = col_offsets[num_columns]; + for (int col=block.start_col; col<=block.end_col; ++col) { + /*if (!col_is_variable) */{ + uint64_t col_offset = 0; + cudf::size_type col_size = col_sizes[col]; + auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; + for (int row=block.start_row + threadIdx.x; row(input_data[col]); + *reinterpret_cast(shmem_dest) = short_col_input[row]; + break; + } + case 4: { + const int32_t *int_col_input = reinterpret_cast(input_data[col]); + *reinterpret_cast(shmem_dest) = int_col_input[row]; + break; + } + case 8: { + const int64_t *long_col_input = reinterpret_cast(input_data[col]); + *reinterpret_cast(shmem_dest) = long_col_input[row]; + break; + } + default: { + cudf::size_type input_offset = col_size * row; + // TODO this should just not be supported for fixed width columns, but just in case... + for (cudf::size_type b = 0; b < col_size; b++) { + shmem_dest[b] = input_data[col][b + input_offset]; + } + break; + } + } + + // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned + // so we have to rewrite the addresses to make sure that it is 4 byte aligned + // we do this directly in the final location because the entire row may not + // fit in shared memory and may require many blocks to process it entirely + int8_t *valid_byte = &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; + cudf::size_type byte_bit_offset = col % 8; + uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; + int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); + cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); + // Now copy validity for the column + if (input_nm[col]) { + if (bit_is_set(input_nm[col], row)) { + atomicOr_block(valid_int, 1 << int_bit_offset); + } else { + atomicAnd_block(valid_int, ~(1 << int_bit_offset)); + } + } else { + // It is valid so just set the bit + atomicOr_block(valid_int, 1 << int_bit_offset); + } + } // end row + + col_offset += col_sizes[col] * (block.end_row - block.start_row); + } + } // end col + + // wait for the data to be totally copied into shared memory + __syncthreads(); + + // Step 2: Copy the data from shared memory to final destination + // each block is potentially a slice of the table, so no assumptions + // can be made about alignments. We do know that the alignment in shared + // memory matches the final destination alignment. 
Also note that + // we are not writing to entirely contiguous destinations as each + // row in shared memory may not be an entire row of the destination. + // + auto const thread_start_offset = threadIdx.x * 8; + auto const thread_stride = gridDim.x * 8; + for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * (block.end_row - block.start_row); src_offset += thread_stride) { + auto const output_row_num = src_offset / shmem_row_size; + auto const row_offset = row_offsets[block.start_row + output_row_num]; + auto const col_offset = src_offset % shmem_row_size; + int8_t *output_ptr = &output_data[block.buffer_num][row_offset + col_offset]; + int8_t *input_ptr = &shared_data[src_offset]; + // the first part and last part of the row is unaligned data copy. This is copied a single byte + // at a time. + if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { + // first part of a row, copy single bytes + auto const num_single_bytes = 8 - dest_shim_offset; + for (auto i=0; i 0 && (src_offset + 8) % shmem_row_size == 0) { + // last part of a row, copy single bytes + auto const num_single_bytes = dest_shim_offset; + for (auto i=0; i(input_ptr); + *reinterpret_cast(output_ptr) = *long_col_input; + } + } +} + +/** + * Calculate the dimensions of the kernel for fixed width only columns. + * @param [in] num_columns the number of columns being copied. + * @param [in] num_rows the number of rows being copied. + * @param [in] size_per_row the size each row takes up when padded. + * @param [out] blocks the size of the blocks for the kernel + * @param [out] threads the size of the threads for the kernel + * @return the size in bytes of shared memory needed for each block. + */ +static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, + const cudf::size_type num_rows, + const cudf::size_type size_per_row, + dim3 &blocks, + dim3 &threads) +{ + // We have found speed degrades when a thread handles more than 4 columns. + // Each block is 2 dimensional. The y dimension indicates the columns. + // We limit this to 32 threads in the y dimension so we can still + // have at least 32 threads in the x dimension (1 warp) which should + // result in better coalescing of memory operations. We also + // want to guarantee that we are processing a multiple of 32 threads + // in the x dimension because we use atomic operations at the block + // level when writing validity data out to main memory, and that would + // need to change if we split a word of validity data between blocks. + int y_block_size = (num_columns + 3) / 4; + if (y_block_size > 32) { y_block_size = 32; } + int x_possible_block_size = 1024 / y_block_size; + // 48KB is the default setting for shared memory per block according to the cuda tutorials + // If someone configures the GPU to only have 16 KB this might not work. + int max_shared_size = 48 * 1024; + int max_block_size = max_shared_size / size_per_row; + // If we don't have enough shared memory there is no point in having more threads + // per block that will just sit idle + max_block_size = max_block_size > x_possible_block_size ? x_possible_block_size : max_block_size; + // Make sure that the x dimension is a multiple of 32 this not only helps + // coalesce memory access it also lets us do a ballot sync for validity to write + // the data back out the warp level. If x is a multiple of 32 then each thread in the y + // dimension is associated with one or more warps, that should correspond to the validity + // words directly. 
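+  // Hypothetical worked example of the arithmetic below (illustrative values
+  // only): with 8 columns and size_per_row = 40, y_block_size = (8 + 3) / 4 = 2,
+  // x_possible_block_size = 1024 / 2 = 512 and max_block_size = 49152 / 40 = 1228,
+  // which is then clamped to 512. block_size stays 512 (already a multiple of 32)
+  // and the kernel needs 40 * 512 = 20480 bytes of shared memory per block.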
+ int block_size = (max_block_size / 32) * 32; + CUDF_EXPECTS(block_size != 0, "Row size is too large to fit in shared memory"); + + int num_blocks = (num_rows + block_size - 1) / block_size; + if (num_blocks < 1) { + num_blocks = 1; + } else if (num_blocks > 10240) { + // The maximum number of blocks supported in the x dimension is 2 ^ 31 - 1 + // but in practice haveing too many can cause some overhead that I don't totally + // understand. Playing around with this haveing as little as 600 blocks appears + // to be able to saturate memory on V100, so this is an order of magnitude higher + // to try and future proof this a bit. + num_blocks = 10240; + } + blocks.x = num_blocks; + blocks.y = 1; + blocks.z = 1; + threads.x = block_size; + threads.y = y_block_size; + threads.z = 1; + return size_per_row * block_size; +} + +/** + * When converting to rows it is possible that the size of the table was too big to fit + * in a single column. This creates an output column for a subset of the rows in a table + * going from start row and containing the next num_rows. Most of the parameters passed + * into this function are common between runs and should be calculated once. + */ +static std::unique_ptr fixed_width_convert_to_rows( + const cudf::size_type start_row, + const cudf::size_type num_rows, + const cudf::size_type num_columns, + const cudf::size_type size_per_row, + std::unique_ptr> &column_start, + std::unique_ptr> &column_size, + std::unique_ptr> &input_data, + std::unique_ptr> &input_nm, + const cudf::scalar &zero, + const cudf::scalar &scalar_size_per_row, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + int64_t total_allocation = size_per_row * num_rows; + // We made a mistake in the split somehow + CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); + + // Allocate and set the offsets row for the byte array + std::unique_ptr offsets = + cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream); + + std::unique_ptr data = + cudf::make_numeric_column(cudf::data_type(cudf::type_id::INT8), + static_cast(total_allocation), + cudf::mask_state::UNALLOCATED, + stream, + mr); + + dim3 blocks; + dim3 threads; + int shared_size = + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + + copy_from_fixed_width_columns<<>>( + start_row, + num_rows, + num_columns, + size_per_row, + column_start->data(), + column_size->data(), + input_data->data(), + input_nm->data(), + data->mutable_view().data()); + + return cudf::make_lists_column(num_rows, + std::move(offsets), + std::move(data), + 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, + stream, + mr); +} + +static cudf::data_type get_data_type(const cudf::column_view &v) { return v.type(); } + +static inline bool are_all_fixed_width(std::vector const &schema) +{ + return std::all_of( + schema.begin(), schema.end(), [](const cudf::data_type &t) { return cudf::is_fixed_width(t); }); +} + +/** + * Given a set of fixed width columns, calculate how the data will be laid out in memory. + * @param [in] schema the types of columns that need to be laid out. + * @param [out] column_start the byte offset where each column starts in the row. + * @param [out] column_size the size in bytes of the data for each columns in the row. + * @return the size in bytes each row needs. 
+ */ +static inline int32_t compute_fixed_width_layout(std::vector const &schema, + std::vector &column_start, + std::vector &column_size) +{ + // We guarantee that the start of each column is 64-bit aligned so anything can go + // there, but to make the code simple we will still do an alignment for it. + int32_t at_offset = 0; + for (auto col = schema.begin(); col < schema.end(); col++) { + cudf::size_type s = cudf::size_of(*col); + column_size.emplace_back(s); + std::size_t allocation_needed = s; + std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types + at_offset = align_offset(at_offset, alignment_needed); + column_start.emplace_back(at_offset); + at_offset += allocation_needed; + } + + // Now we need to add in space for validity + // Eventually we can think about nullable vs not nullable, but for now we will just always add it + // in + int32_t validity_bytes_needed = (schema.size() + 7) / 8; + // validity comes at the end and is byte aligned so we can pack more in. + at_offset += validity_bytes_needed; + // Now we need to pad the end so all rows are 64 bit aligned + return align_offset(at_offset, 8); // 8 bytes (64 bits) +} + +} // namespace detail + +//#define DEBUG +std::vector> convert_to_rows2(cudf::table_view const &tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the data, but small enough + // that multiple columns fit in memory so the writes can coalese as well. Potential optimization for window sizes. + constexpr int max_window_height = 1024; + const size_type num_columns = tbl.num_columns(); + const size_type num_rows = tbl.num_rows(); + + #if defined(DEBUG) + auto pretty_print = [](uint64_t i) { + if (i > (1 * 1024 * 1024 * 1024)) { + printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); + } else if (i > (1 * 1024 * 1024)) { + printf("%.2f MB", i / float(1 * 1024 * 1024)); + } else if (i > (1 * 1024)) { + printf("%.2f KB", float(i / 1024)); + } else { + printf("%lu Bytes", i); + } + }; + #endif + + int device_id; + CUDA_TRY(cudaGetDevice(&device_id)); + int shmem_limit_per_block; + CUDA_TRY( + cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + + // break up the work into blocks, which are a starting and ending row/col #. + // this window size is calculated based on the shared memory size available + // we want a single block to fill up the entire shared memory space available + // for the transpose-like conversion. + + // There are two different processes going on here. The GPU conversion of the data + // and the writing of the data into the list of byte columns that are a maximum of + // 2 gigs each due to offset maximum size. The GPU conversion portion has to understand + // this limitation because the column must own the data inside and as a result it must be + // a distinct allocation for that column. Copying the data into these final buffers would + // be prohibitively expensive, so care is taken to ensure the GPU writes to the proper buffer. + // The windows are broken at the boundaries of specific rows based on the row sizes up + // to that point. These are row batches and they are decided first before building the + // windows so the windows can be properly cut around them. 
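+  // As a rough, hypothetical sizing example (the real limit is whatever the
+  // device attribute query above returned): with a 48 KB shared memory limit and
+  // the maximum window height of 1024 rows, the window-building loop below can
+  // pack about 48 bytes of row width into a single window, i.e. roughly a dozen
+  // 4-byte columns per window.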
+ + std::vector row_sizes; // size of each row in bytes including any alignment padding + std::vector row_offsets; // offset from the start of the data to this row + std::vector column_sizes; // byte size of each column + std::vector column_starts; // offset of column inside a row including alignment + std::vector variable_width_columns; // list of the variable width columns in the table + row_sizes.reserve(num_rows); + row_offsets.reserve(num_rows); + column_sizes.reserve(num_columns); + column_starts.reserve(num_columns+1); // we add a final offset for validity data start + + size_type fixed_width_size_per_row = 0; + for (int col = 0; col < num_columns; ++col) { + auto cv = tbl.column(col); + auto col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (nested_type) { variable_width_columns.push_back(cv);} + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + } + + // When building the columns to return, we have to be mindful of the offset limit in cudf. + // It is 32-bit and these data columns are capable of surpassing that easily. The data should + // not be cut off exactly at the limit though due to the validity buffers. The most efficient + // place to cut the validity is on a 32-row boundary, so as we calculate the row sizes + // we keep track of the cut points for the validity, which we call row batches. If the row + // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we hit. + // Note that this boundary is for our book-keeping with column pointers and not anything + // that the kernel needs to worry about. We cut the output at convienient boundaries + // when assembling the outgoing data stream. + struct row_batch { + size_type num_bytes; + size_type row_count; + }; + std::vector row_batches; + + auto calculate_variable_width_row_data_size = [](int const row) { + // each level of variable-width data will add an offset/length + // uint64 of data. The first of which is inside the fixed-width + // data itself and needs to be aligned based on what is around + // that data. This is handled above with the fixed-width calculations + // for that reason. We may still need to add more of these offset/length + // combinations if the nesting is deeper than one level as these + // will be included in the variable-width data blob at the end of the + // row. + return 0; +/* auto c = variable_width_columns[col]; + while (true) { + auto col_offsets = c.child(0).data(); + auto col_data_size = size_of(c.child(1).type()); + std::size_t alignment_needed = col_data_size; + + row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; + if (c.num_children() == 0) { + break; + } + c = c.child(1); + } +*/ + }; + + uint64_t row_batch_size = 0; + uint64_t total_table_size = 0; + size_type row_batch_rows = 0; + uint64_t row_offset = 0; + + // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then calculate + // the size of each row's variable-width data as well. 
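+  // Hypothetical example of the layout computed above: for columns of types
+  // INT8, INT32 and INT64, the column loop produces column_starts = {0, 4, 8}
+  // (each column aligned to its own size) and fixed_width_size_per_row = 16,
+  // so each row below starts from a 16-byte fixed-width footprint before any
+  // variable-width data is accounted for.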
+ for (int row = 0; row < num_rows; ++row) { + row_sizes[row] = fixed_width_size_per_row + calculate_variable_width_row_data_size(row); + if (row_batch_size + row_sizes[row] > std::numeric_limits::max()) { + // a new batch starts at the last 32-row boundary + row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batch_size = 0; + row_batch_rows = row_batch_rows & 31; + row_offset = 0; + } + row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned + row_offsets.push_back(row_offset); + row_batch_size += row_sizes[row]; + row_offset += row_sizes[row]; + total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned + total_table_size += row_sizes[row]; + row_batch_rows++; + } + if (row_batch_size > 0) { + row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + } + + #if defined(DEBUG) + printf("%lu batches:\n", row_batches.size()); + for (auto i = 0; i < (int)row_batches.size(); ++i) { + printf("%d: %d rows, ", i, row_batches[i].row_count); + pretty_print(row_batches[i].num_bytes); + printf("\n"); + } + #endif + + std::vector block_infos; + + // block infos are organized with the windows going "down" the columns + // this provides the most coalescing of memory access + int current_window_size = 0; + int current_window_start_col = 0; + + // build the blocks for a specific set of columns + auto build_blocks = [&block_infos, &row_batches, num_rows](int const start_col, int const end_col, int const desired_window_height) { + int current_window_start_row = 0; + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; + while (i < num_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(desired_window_height, rows_left_in_batch); + + block_infos.emplace_back( + detail::block_info{start_col, + current_window_start_row, + start_col + end_col, + std::min(current_window_start_row + window_height - 1, num_rows), current_window_row_batch}); + + i += window_height; + current_window_start_row += window_height; + rows_left_in_batch -= window_height; + } + }; + + int const window_height = std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); + + int row_size = 0; + + // march each column and build the blocks of appropriate sizes + for (int col = 0; col < num_columns; ++col) { + auto const col_size = column_sizes[col]; + + // align size for this type + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_with_this_col = detail::align_offset(row_size, alignment_needed) + col_size; + + if (row_size_with_this_col * window_height > shmem_limit_per_block) { + // too large, close this window, generate vertical blocks and restart + build_blocks(current_window_start_col, col - 1, window_height); + row_size = detail::align_offset(column_starts[col] & 7, alignment_needed) + col_size; // alignment required for shared memory window boundary to match alignment of output row + current_window_start_col = col; + } else { + row_size = row_size_with_this_col; + } + } + + auto validity_offset = detail::align_offset(column_starts.back(), 4); + column_starts.push_back(validity_offset); + + // build last set of blocks + if (current_window_size > 0) { build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); } + + // Get the pointers 
to the input columnar data ready - possibly moved up to copy to gpu while calculating other things + std::vector input_data; + std::vector input_nm; + for (size_type column_number = 0; column_number < num_columns; column_number++) { + column_view cv = tbl.column(column_number); + auto const col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (!nested_type) { + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + } + + #if defined(DEBUG) + printf("%lu windows for %d columns, %d rows to fit in ", block_infos.size(), block_infos[0].end_col - block_infos[0].start_col, block_infos[0].end_row - block_infos[0].start_row); + pretty_print(shmem_limit_per_block); + printf(" shared mem("); + pretty_print(fixed_width_size_per_row); + printf("/row, %d columns, %d rows, ", num_columns, num_rows); + pretty_print(total_table_size); + printf(" total):\n"); + #endif + + auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + + std::vector output_data; + output_data.reserve(row_batches.size()); + for (uint i=0; i>>(num_rows, + num_columns, + dev_input_data.data(), + dev_input_nm.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + dev_block_infos.data(), + dev_row_offsets.data(), + reinterpret_cast(dev_output_data.data())); + + // split up the output buffer into multiple buffers based on row batch sizes + // and create list of byte columns + int offset_offset = 0; + std::vector> ret; + for (uint i=0; i offset_vals; + offset_vals.reserve(row_batches[i].row_count + 1); + size_type cur_offset = 0; + offset_vals.push_back(cur_offset); + for (int row=0; row(data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); + + auto data = + std::make_unique(data_type{cudf::type_id::INT8}, + row_batches[i].num_bytes, + std::move(output_data[i])); + + ret.push_back(cudf::make_lists_column(row_batches[i].row_count, + std::move(offsets), + std::move(data), + 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, + stream, + mr)); + } + + return ret; +} + +std::vector> convert_to_rows(cudf::table_view const &tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + const cudf::size_type num_columns = tbl.num_columns(); + + std::vector schema; + schema.resize(num_columns); + std::transform(tbl.begin(), tbl.end(), schema.begin(), detail::get_data_type); + + if (detail::are_all_fixed_width(schema)) { + std::vector column_start; + std::vector column_size; + + int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); + auto dev_column_start = detail::copy_to_dev_async(column_start, stream, mr); + auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); + + int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; + // Make the number of rows per batch a multiple of 32 so we don't have to worry about + // splitting validity at a specific row offset. This might change in the future. 
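+    // Hypothetical example (illustrative row size only): with size_per_row = 40
+    // bytes, max_rows_per_batch starts at 2147483647 / 40 = 53,687,091 and the
+    // rounding below lowers it to 53,687,072, the next lower multiple of 32.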
+ max_rows_per_batch = (max_rows_per_batch / 32) * 32; + + cudf::size_type num_rows = tbl.num_rows(); + + // Get the pointers to the input columnar data ready + std::vector input_data; + std::vector input_nm; + for (cudf::size_type column_number = 0; column_number < num_columns; column_number++) { + cudf::column_view cv = tbl.column(column_number); + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + auto dev_input_data = detail::copy_to_dev_async(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async(input_nm, stream, mr); + + using ScalarType = cudf::scalar_type_t; + auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); + zero->set_valid(true, stream); + static_cast(zero.get())->set_value(0, stream); + + auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); + step->set_valid(true, stream); + static_cast(step.get()) + ->set_value(static_cast(size_per_row), stream); + + std::vector> ret; + for (cudf::size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { + cudf::size_type row_count = num_rows - row_start; + row_count = row_count > max_rows_per_batch ? max_rows_per_batch : row_count; + ret.emplace_back(detail::fixed_width_convert_to_rows(row_start, + row_count, + num_columns, + size_per_row, + dev_column_start, + dev_column_size, + dev_input_data, + dev_input_nm, + *zero, + *step, + stream, + mr)); + } + + return ret; + } else { + CUDF_FAIL("Only fixed width types are currently supported"); + } +} + +std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + // verify that the types are what we expect + cudf::column_view child = input.child(); + cudf::type_id list_type = child.type().id(); + CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, + "Only a list of bytes is supported as input"); + + cudf::size_type num_columns = schema.size(); + + if (detail::are_all_fixed_width(schema)) { + std::vector column_start; + std::vector column_size; + + cudf::size_type num_rows = input.parent().size(); + int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); + + // Ideally we would check that the offsets are all the same, etc. 
but for now + // this is probably fine + CUDF_EXPECTS(size_per_row * num_rows == child.size(), + "The layout of the data appears to be off"); + auto dev_column_start = detail::copy_to_dev_async(column_start, stream, mr); + auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); + + // Allocate the columns we are going to write into + std::vector> output_columns; + std::vector output_data; + std::vector output_nm; + for (cudf::size_type i = 0; i < num_columns; i++) { + auto column = cudf::make_fixed_width_column( + schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); + auto mut = column->mutable_view(); + output_data.emplace_back(mut.data()); + output_nm.emplace_back(mut.null_mask()); + output_columns.emplace_back(std::move(column)); + } + + auto dev_output_data = detail::copy_to_dev_async(output_data, stream, mr); + auto dev_output_nm = detail::copy_to_dev_async(output_nm, stream, mr); + + dim3 blocks; + dim3 threads; + int shared_size = + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + + detail::copy_to_fixed_width_columns<<>>( + num_rows, + num_columns, + size_per_row, + dev_column_start->data(), + dev_column_size->data(), + dev_output_data->data(), + dev_output_nm->data(), + child.data()); + + return std::make_unique(std::move(output_columns)); + } else { + CUDF_FAIL("Only fixed width types are currently supported"); + } +} + +std::unique_ptr convert_from_rows( + std::vector> const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + CUDF_EXPECTS(input.size() == 1, "Too large of an input, need to concat the output tables..."); + + // for (uint i=0; iview(); + auto ret = convert_from_rows(lcv, schema, stream, mr); + + return ret; + // } +} + +} // namespace cudf From 6e869b61c91546175792a95834c7a81f951060fd Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 10 Jun 2021 17:53:09 +0000 Subject: [PATCH 02/80] fixing kernel launch and updating --- .../row_conversion/row_conversion.cpp | 9 +- cpp/src/row_conversion/row_conversion.cu | 105 +++++++++++++----- 2 files changed, 83 insertions(+), 31 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index c4edee91b3c..9fa05c408e5 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -28,7 +28,7 @@ class RowConversion : public cudf::benchmark { static void BM_to_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, +/* auto const table = create_random_table({cudf::type_id::INT8, cudf::type_id::INT32, cudf::type_id::INT16, cudf::type_id::INT64, @@ -38,7 +38,10 @@ static void BM_to_row(benchmark::State& state) cudf::type_id::UINT8, cudf::type_id::UINT64}, 50, - row_count{n_rows}); + row_count{n_rows});*/ + auto const table = create_random_table({cudf::type_id::INT32}, + 64, + row_count{n_rows}); cudf::size_type total_bytes = 0; for (int i = 0; i < table->num_columns(); ++i) { @@ -98,7 +101,7 @@ static void BM_from_row(benchmark::State& state) (::benchmark::State & st) { BM_to_row(st); } \ BENCHMARK_REGISTER_F(RowConversion, name) \ ->RangeMultiplier(8) \ - ->Ranges({{1 << 16, 1 << 24}}) \ + ->Ranges({{1 << 6, 1 << 20}}) \ ->UseManualTime() \ ->Unit(benchmark::kMillisecond); diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu 
index fb5dc4cb38d..994233a0700 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include #include @@ -347,14 +348,14 @@ struct block_info { * @param output_data pointer to output data * */ -__global__ void copy_from_columns(const cudf::size_type num_rows, - const cudf::size_type num_columns, +__global__ void copy_from_columns(const size_type num_rows, + const size_type num_columns, const int8_t **input_data, - const cudf::bitmask_type **input_nm, - const cudf::size_type *col_sizes, - const cudf::size_type *col_offsets, + const bitmask_type **input_nm, + const size_type *col_sizes, + const size_type *col_offsets, const block_info *block_infos, - const uint64_t *row_offsets, + const size_type *row_offsets, int8_t **output_data) { // We are going to copy the data in two passes. @@ -365,47 +366,92 @@ __global__ void copy_from_columns(const cudf::size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. + bool debug_print = false; + + if (debug_print) { + printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); + printf("Column Info:\n"); + for (int i=0; i(&output_data[0][output_start_offset]) & 7; // offset for alignment shim in order to match shared memory with final dest - - printf("copying from column %d to column %d with rows %d to row %d(grid dim %d, blockIdx %d)\n", block.start_col, block.end_col, block.start_row, block.end_row, gridDim.x, blockIdx.x); - + if (debug_print) { + printf("outputting to offset %lu\n", output_start_offset); + printf("dest shim offset is %d\n", dest_shim_offset); + printf("Shared data is %p-%p\n", shared_data, shared_data + (48 * 1024)); + } // each thread is responsible for every threadcount rows of data. // the data is copies into shared memory in the final layout. 
auto const shmem_row_size = align_offset(col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col] + dest_shim_offset, 8); // 8 byte alignment required for shared memory rows auto const validity_offset = col_offsets[num_columns]; + if (debug_print) { + printf("col_offsets[%d] = %d, col_sizes[%d] = %d, col_offsets[%d] = %d\n", block.end_col, col_offsets[block.end_col], block.end_col, col_sizes[block.end_col], block.start_col, col_offsets[block.start_col]); + printf("shmem row size %d\n", shmem_row_size); + printf("validity offset is %d\n", validity_offset); + printf("starting at %d,%d and going to %d, %d\n", block.start_col, block.start_row, block.end_col, block.end_row); + } for (int col=block.start_col; col<=block.end_col; ++col) { /*if (!col_is_variable) */{ uint64_t col_offset = 0; cudf::size_type col_size = col_sizes[col]; auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; + if (debug_print) { + printf("dest col offset %d\n", dest_col_offset); + } for (int row=block.start_row + threadIdx.x; row(input_data[col]); + if (debug_print) { + printf("%p <- short %d\n", shmem_dest, short_col_input[row]); + } *reinterpret_cast(shmem_dest) = short_col_input[row]; break; } case 4: { const int32_t *int_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { + printf("shmem[%d][%d] - %p <- int %d\n", row, col, shmem_dest, int_col_input[row]); + } *reinterpret_cast(shmem_dest) = int_col_input[row]; break; } case 8: { const int64_t *long_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { + printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); + } *reinterpret_cast(shmem_dest) = long_col_input[row]; break; } default: { cudf::size_type input_offset = col_size * row; - // TODO this should just not be supported for fixed width columns, but just in case... + if (debug_print) { + printf("byte for byte copy due to size %d\n", col_size); + printf("%p <- input_data[%d] which is %d\n", shmem_dest, input_offset, input_data[col][input_offset]); + } + // TODO this should just not be supported for fixed width columns, but just in case... for (cudf::size_type b = 0; b < col_size; b++) { shmem_dest[b] = input_data[col][b + input_offset]; } @@ -676,6 +722,12 @@ std::vector> convert_to_rows2(cudf::table_view con CUDA_TRY( cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + #if defined(DEBUG) + size_t free, total; + cudaMemGetInfo( &free, &total ); + printf("%lu/%lu Memory", free, total); + #endif + // break up the work into blocks, which are a starting and ending row/col #. // this window size is calculated based on the shared memory size available // we want a single block to fill up the entire shared memory space available @@ -692,7 +744,7 @@ std::vector> convert_to_rows2(cudf::table_view con // windows so the windows can be properly cut around them. 
std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row + std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column std::vector column_starts; // offset of column inside a row including alignment std::vector variable_width_columns; // list of the variable width columns in the table @@ -821,7 +873,7 @@ std::vector> convert_to_rows2(cudf::table_view con block_infos.emplace_back( detail::block_info{start_col, current_window_start_row, - start_col + end_col, + end_col, std::min(current_window_start_row + window_height - 1, num_rows), current_window_row_batch}); i += window_height; @@ -889,23 +941,20 @@ std::vector> convert_to_rows2(cudf::table_view con auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); - std::vector output_data; + std::vector output_buffers; + std::vector output_data; output_data.reserve(row_batches.size()); for (uint i=0; i(temp.data())); + output_buffers.push_back(std::move(temp)); } - auto dev_output_data = detail::copy_to_dev_async2(row_offsets, stream, mr); + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); // blast through the entire table and convert it - dim3 blocks; - dim3 threads; - blocks.x = block_infos.size(); - blocks.y = 0; - blocks.z = 0; - threads.x = 1024; - threads.y = 0; - threads.z = 0; - detail::copy_from_columns<<>>(num_rows, + dim3 blocks(block_infos.size()); + dim3 threads(1024); + copy_from_columns<<>>(num_rows, num_columns, dev_input_data.data(), dev_input_nm.data(), @@ -932,14 +981,14 @@ std::vector> convert_to_rows2(cudf::table_view con } offset_offset += row_batches[i].row_count; - auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); + auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); auto offsets = std::make_unique(data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); auto data = std::make_unique(data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, - std::move(output_data[i])); + std::move(output_buffers[i])); ret.push_back(cudf::make_lists_column(row_batches[i].row_count, std::move(offsets), From 2703baf52c60ec74bfecb1a495441380fbf55d39 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 16 Jun 2021 19:25:57 +0000 Subject: [PATCH 03/80] Updates and bug fixing --- .../row_conversion/row_conversion.cpp | 76 ++- cpp/src/row_conversion/row_conversion.cu | 498 ++++++++++++------ cpp/tests/CMakeLists.txt | 4 + cpp/tests/row_conversion/row_conversion.cpp | 110 ++++ 4 files changed, 492 insertions(+), 196 deletions(-) create mode 100644 cpp/tests/row_conversion/row_conversion.cpp diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index 9fa05c408e5..e1228c9df21 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -25,10 +25,43 @@ class RowConversion : public cudf::benchmark { }; -static void BM_to_row(benchmark::State& state) +static void BM_old_to_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; -/* auto const table = create_random_table({cudf::type_id::INT8, + auto const table = create_random_table({cudf::type_id::INT8, + cudf::type_id::INT32, + cudf::type_id::INT16, + cudf::type_id::INT64, + 
cudf::type_id::INT32, + cudf::type_id::BOOL8, + cudf::type_id::UINT16, + cudf::type_id::UINT8, + cudf::type_id::UINT64}, + 212, + row_count{n_rows}); + /* auto const table = create_random_table({cudf::type_id::INT32}, + 64, + row_count{n_rows});*/ + + cudf::size_type total_bytes = 0; + for (int i = 0; i < table->num_columns(); ++i) { + auto t = table->get_column(i).type(); + total_bytes += cudf::size_of(t); + } + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + + auto rows = cudf::convert_to_rows(table->view()); + } + + state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); +} + +static void BM_new_to_row(benchmark::State& state) +{ + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + auto const table = create_random_table({cudf::type_id::INT8, cudf::type_id::INT32, cudf::type_id::INT16, cudf::type_id::INT64, @@ -37,11 +70,11 @@ static void BM_to_row(benchmark::State& state) cudf::type_id::UINT16, cudf::type_id::UINT8, cudf::type_id::UINT64}, - 50, - row_count{n_rows});*/ - auto const table = create_random_table({cudf::type_id::INT32}, - 64, - row_count{n_rows}); + 212, + row_count{n_rows}); + /* auto const table = create_random_table({cudf::type_id::INT32}, + 64, + row_count{n_rows});*/ cudf::size_type total_bytes = 0; for (int i = 0; i < table->num_columns(); ++i) { @@ -52,14 +85,13 @@ static void BM_to_row(benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); -// auto rows = cudf::convert_to_rows(table->view()); auto new_rows = cudf::convert_to_rows2(table->view()); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); } -static void BM_from_row(benchmark::State& state) +/*static void BM_from_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const table = create_random_table({cudf::type_id::INT8, @@ -73,9 +105,6 @@ static void BM_from_row(benchmark::State& state) cudf::type_id::UINT64}, 256, row_count{n_rows}); - /* auto const table = create_random_table({cudf::type_id::INT32}, - 4, - row_count{n_rows});*/ std::vector schema; cudf::size_type total_bytes = 0; @@ -94,18 +123,19 @@ static void BM_from_row(benchmark::State& state) } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { BM_to_row(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ +}*/ + +#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -TO_ROW_CONVERSION_BENCHMARK_DEFINE(to_row_conversion) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) #define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ BENCHMARK_DEFINE_F(RowConversion, name) \ @@ -116,4 +146,4 @@ TO_ROW_CONVERSION_BENCHMARK_DEFINE(to_row_conversion) ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) +//FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) diff --git a/cpp/src/row_conversion/row_conversion.cu 
b/cpp/src/row_conversion/row_conversion.cu index 994233a0700..92ba075c316 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -44,7 +44,6 @@ static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size return (offset + alignment - 1) & ~(alignment - 1); } - /** * Copy a simple vector to device memory asynchronously. Be sure to read * the data on the same stream as is used to copy it. @@ -61,10 +60,9 @@ std::unique_ptr> copy_to_dev_async(const std::vector & } template -rmm::device_uvector copy_to_dev_async2( - const std::vector &input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) +rmm::device_uvector copy_to_dev_async2(const std::vector &input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { rmm::device_uvector ret(input.size(), stream, mr); CUDA_TRY(cudaMemcpyAsync( @@ -346,7 +344,7 @@ struct block_info { * @param block_infos information about the blocks of work * @param row_offsets offset to a specific row in the input data * @param output_data pointer to output data - * + * */ __global__ void copy_from_columns(const size_type num_rows, const size_type num_columns, @@ -366,92 +364,119 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. - bool debug_print = false; - + bool debug_print = false; // blockIdx.x == 70 && threadIdx.x == 448; + if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); printf("Column Info:\n"); - for (int i=0; i(&output_data[0][output_start_offset]) & 7; // offset for alignment shim in order to match shared memory with final dest + uint8_t const dest_shim_offset = + reinterpret_cast(&output_data[0][output_start_offset]) & + 7; // offset for alignment shim in order to match shared memory with final dest if (debug_print) { printf("outputting to offset %lu\n", output_start_offset); printf("dest shim offset is %d\n", dest_shim_offset); printf("Shared data is %p-%p\n", shared_data, shared_data + (48 * 1024)); + printf("my block is %d,%d -> %d,%d - buffer %d\n", + block.start_col, + block.start_row, + block.end_col, + block.end_row, + block.buffer_num); } // each thread is responsible for every threadcount rows of data. // the data is copies into shared memory in the final layout. 
- auto const shmem_row_size = align_offset(col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col] + dest_shim_offset, 8); // 8 byte alignment required for shared memory rows + auto const real_bytes_in_row = + col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col]; + auto const shmem_row_size = align_offset(real_bytes_in_row + dest_shim_offset, + 8); // 8 byte alignment required for shared memory rows auto const validity_offset = col_offsets[num_columns]; if (debug_print) { - printf("col_offsets[%d] = %d, col_sizes[%d] = %d, col_offsets[%d] = %d\n", block.end_col, col_offsets[block.end_col], block.end_col, col_sizes[block.end_col], block.start_col, col_offsets[block.start_col]); + printf("col_offsets[%d] = %d, col_sizes[%d] = %d, col_offsets[%d] = %d\n", + block.end_col, + col_offsets[block.end_col], + block.end_col, + col_sizes[block.end_col], + block.start_col, + col_offsets[block.start_col]); printf("shmem row size %d\n", shmem_row_size); printf("validity offset is %d\n", validity_offset); - printf("starting at %d,%d and going to %d, %d\n", block.start_col, block.start_row, block.end_col, block.end_row); + printf("starting at %d,%d and going to %d, %d\n", + block.start_col, + block.start_row, + block.end_col, + block.end_row); } - for (int col=block.start_col; col<=block.end_col; ++col) { - /*if (!col_is_variable) */{ - uint64_t col_offset = 0; + for (int col = block.start_col; col <= block.end_col; ++col) { + /*if (!col_is_variable) */ { + uint64_t col_offset = 0; cudf::size_type col_size = col_sizes[col]; - auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; - if (debug_print) { - printf("dest col offset %d\n", dest_col_offset); - } - for (int row=block.start_row + threadIdx.x; row(input_data[col]); - if (debug_print) { - printf("%p <- short %d\n", shmem_dest, short_col_input[row]); - } + const int16_t *short_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { printf("%p <- short %d\n", shmem_dest, short_col_input[row]); } *reinterpret_cast(shmem_dest) = short_col_input[row]; break; } case 4: { - const int32_t *int_col_input = reinterpret_cast(input_data[col]); + const int32_t *int_col_input = reinterpret_cast(input_data[col]); if (debug_print) { - printf("shmem[%d][%d] - %p <- int %d\n", row, col, shmem_dest, int_col_input[row]); + printf("shmem[%d][%d] - %p <- int 0x%x\n", row, col, shmem_dest, int_col_input[row]); } *reinterpret_cast(shmem_dest) = int_col_input[row]; break; } case 8: { - const int64_t *long_col_input = reinterpret_cast(input_data[col]); - if (debug_print) { - printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); - } + const int64_t *long_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); } *reinterpret_cast(shmem_dest) = long_col_input[row]; break; } default: { cudf::size_type input_offset = col_size * row; if (debug_print) { - printf("byte for byte copy due to size %d\n", col_size); - printf("%p <- input_data[%d] which is %d\n", shmem_dest, input_offset, input_data[col][input_offset]); - } - // TODO this should just not be supported for fixed width columns, but just in case... + printf("byte for byte copy due to size %d of column %d\n", col_size, col); + printf("%p <- input_data[%d] which is %d\n", + shmem_dest, + input_offset, + input_data[col][input_offset]); + } + // TODO this should just not be supported for fixed width columns, but just in case... 
for (cudf::size_type b = 0; b < col_size; b++) { shmem_dest[b] = input_data[col][b + input_offset]; } @@ -463,11 +488,13 @@ __global__ void copy_from_columns(const size_type num_rows, // so we have to rewrite the addresses to make sure that it is 4 byte aligned // we do this directly in the final location because the entire row may not // fit in shared memory and may require many blocks to process it entirely - int8_t *valid_byte = &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; + int8_t *valid_byte = + &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; cudf::size_type byte_bit_offset = col % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); + if (debug_print) { printf("Outputting validity to %p\n", valid_byte); } // Now copy validity for the column if (input_nm[col]) { if (bit_is_set(input_nm[col], row)) { @@ -479,11 +506,11 @@ __global__ void copy_from_columns(const size_type num_rows, // It is valid so just set the bit atomicOr_block(valid_int, 1 << int_bit_offset); } - } // end row + } // end row - col_offset += col_sizes[col] * (block.end_row - block.start_row); + col_offset += col_sizes[col] * rows_in_block; } - } // end col + } // end col // wait for the data to be totally copied into shared memory __syncthreads(); @@ -496,30 +523,75 @@ __global__ void copy_from_columns(const size_type num_rows, // row in shared memory may not be an entire row of the destination. // auto const thread_start_offset = threadIdx.x * 8; - auto const thread_stride = gridDim.x * 8; - for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * (block.end_row - block.start_row); src_offset += thread_stride) { + auto const thread_stride = gridDim.x * 8; + if (debug_print) { + printf("writing final data from %d to %d at stride %d\n", + thread_start_offset, + shmem_row_size * rows_in_block, + thread_stride); + printf("rows in block %d\n", rows_in_block); + } + for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * rows_in_block; + src_offset += thread_stride) { auto const output_row_num = src_offset / shmem_row_size; - auto const row_offset = row_offsets[block.start_row + output_row_num]; - auto const col_offset = src_offset % shmem_row_size; - int8_t *output_ptr = &output_data[block.buffer_num][row_offset + col_offset]; - int8_t *input_ptr = &shared_data[src_offset]; - // the first part and last part of the row is unaligned data copy. This is copied a single byte - // at a time. - if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { - // first part of a row, copy single bytes + auto const row_offset = row_offsets[block.start_row + output_row_num]; + auto const col_offset = src_offset % shmem_row_size; + int8_t *output_ptr = &output_data[block.buffer_num][row_offset + col_offset]; + int8_t *input_ptr = &shared_data[src_offset]; + + // three cases to worry about here + // 1) first 8-byte part of a large row - dest_shim_offset bytes of pad at the front + // 2) last 8-byte part of a large row - some bytes of pad at the end + // 3) corner case of <= 8 bytes of data, which means dest_shim_offset bytes of pad at the front + // AND potentially pad at the rear + + // we know the real number of bytes in a row, so we can figure out if we are in case 3 easily. + // 1st case is when we're at some even multiple of shmem_row_size offset. 
+ // 2nd case is when offset + 8 is some even multiple of shmem_row_size. + // must be an 8 byte copy + + // there is a chance we have a 0 dest_shim_offset and an 8 byte thing to copy, optimize? + if (real_bytes_in_row + dest_shim_offset <= 8) { + // case 3, we want to copy real_bytes_in_row bytes + auto const num_single_bytes = real_bytes_in_row - dest_shim_offset; + for (auto i = 0; i < num_single_bytes; ++i) { + if (debug_print) { + printf("case 3 - %d single byte final write %p -> %p\n", + num_single_bytes, + &input_ptr[i + dest_shim_offset], + &output_ptr[i]); + } + output_ptr[i] = input_ptr[i + dest_shim_offset]; + } + } else if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { + // first byte with leading pad auto const num_single_bytes = 8 - dest_shim_offset; - for (auto i=0; i %p\n", &input_ptr[i + dest_shim_offset], &output_ptr[i]); + } output_ptr[i] = input_ptr[i + dest_shim_offset]; } - } else if (dest_shim_offset > 0 && (src_offset + 8) % shmem_row_size == 0) { - // last part of a row, copy single bytes - auto const num_single_bytes = dest_shim_offset; - for (auto i=0; i 0) { + // last bytes of a row + auto const num_single_bytes = (real_bytes_in_row + dest_shim_offset) % 8; + for (auto i = 0; i < num_single_bytes; ++i) { + if (debug_print) { + printf("single trailing byte final write %p -> %p\n", + &input_ptr[i + dest_shim_offset], + &output_ptr[i]); + } output_ptr[i] = input_ptr[i + dest_shim_offset]; } } else { // copy 8 bytes aligned - const int64_t *long_col_input = reinterpret_cast(input_ptr); + const int64_t *long_col_input = reinterpret_cast(input_ptr); + if (debug_print) { + printf( + "long final write %p -> %p\n", long_col_input, reinterpret_cast(output_ptr)); + } *reinterpret_cast(output_ptr) = *long_col_input; } } @@ -696,13 +768,14 @@ std::vector> convert_to_rows2(cudf::table_view con rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the data, but small enough - // that multiple columns fit in memory so the writes can coalese as well. Potential optimization for window sizes. + // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the + // data, but small enough that multiple columns fit in memory so the writes can coalese as well. + // Potential optimization for window sizes. constexpr int max_window_height = 1024; - const size_type num_columns = tbl.num_columns(); - const size_type num_rows = tbl.num_rows(); + const size_type num_columns = tbl.num_columns(); + const size_type num_rows = tbl.num_rows(); - #if defined(DEBUG) +#if defined(DEBUG) auto pretty_print = [](uint64_t i) { if (i > (1 * 1024 * 1024 * 1024)) { printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); @@ -714,7 +787,7 @@ std::vector> convert_to_rows2(cudf::table_view con printf("%lu Bytes", i); } }; - #endif +#endif int device_id; CUDA_TRY(cudaGetDevice(&device_id)); @@ -722,11 +795,11 @@ std::vector> convert_to_rows2(cudf::table_view con CUDA_TRY( cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - #if defined(DEBUG) +#if defined(DEBUG) size_t free, total; - cudaMemGetInfo( &free, &total ); - printf("%lu/%lu Memory", free, total); - #endif + cudaMemGetInfo(&free, &total); + printf("%lu/%lu Memory\n", free, total); +#endif // break up the work into blocks, which are a starting and ending row/col #. 
// this window size is calculated based on the shared memory size available @@ -743,45 +816,46 @@ std::vector> convert_to_rows2(cudf::table_view con // to that point. These are row batches and they are decided first before building the // windows so the windows can be properly cut around them. - std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row + std::vector row_sizes; // size of each row in bytes including any alignment padding + std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column - std::vector column_starts; // offset of column inside a row including alignment - std::vector variable_width_columns; // list of the variable width columns in the table + std::vector column_starts; // offset of column inside a row including alignment + std::vector + variable_width_columns; // list of the variable width columns in the table row_sizes.reserve(num_rows); row_offsets.reserve(num_rows); column_sizes.reserve(num_columns); - column_starts.reserve(num_columns+1); // we add a final offset for validity data start + column_starts.reserve(num_columns + 1); // we add a final offset for validity data start size_type fixed_width_size_per_row = 0; for (int col = 0; col < num_columns; ++col) { - auto cv = tbl.column(col); - auto col_type = cv.type(); + auto cv = tbl.column(col); + auto col_type = cv.type(); bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - if (nested_type) { variable_width_columns.push_back(cv);} + if (nested_type) { variable_width_columns.push_back(cv); } // a list or string column will write a single uint64 // of data here for offset/length auto col_size = nested_type ? 8 : size_of(col_type); // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); column_starts.push_back(fixed_width_size_per_row); column_sizes.push_back(col_size); fixed_width_size_per_row += col_size; } - + // When building the columns to return, we have to be mindful of the offset limit in cudf. // It is 32-bit and these data columns are capable of surpassing that easily. The data should // not be cut off exactly at the limit though due to the validity buffers. The most efficient // place to cut the validity is on a 32-row boundary, so as we calculate the row sizes // we keep track of the cut points for the validity, which we call row batches. If the row - // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we hit. - // Note that this boundary is for our book-keeping with column pointers and not anything - // that the kernel needs to worry about. We cut the output at convienient boundaries - // when assembling the outgoing data stream. + // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we + // hit. Note that this boundary is for our book-keeping with column pointers and not anything that + // the kernel needs to worry about. We cut the output at convienient boundaries when assembling + // the outgoing data stream. 
struct row_batch { size_type num_bytes; size_type row_count; @@ -798,71 +872,90 @@ std::vector> convert_to_rows2(cudf::table_view con // will be included in the variable-width data blob at the end of the // row. return 0; -/* auto c = variable_width_columns[col]; - while (true) { - auto col_offsets = c.child(0).data(); - auto col_data_size = size_of(c.child(1).type()); - std::size_t alignment_needed = col_data_size; - - row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; - if (c.num_children() == 0) { - break; - } - c = c.child(1); - } -*/ + /* auto c = variable_width_columns[col]; + while (true) { + auto col_offsets = c.child(0).data(); + auto col_data_size = size_of(c.child(1).type()); + std::size_t alignment_needed = col_data_size; + + row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; + if (c.num_children() == 0) { + break; + } + c = c.child(1); + } + */ }; uint64_t row_batch_size = 0; uint64_t total_table_size = 0; - size_type row_batch_rows = 0; - uint64_t row_offset = 0; + size_type row_batch_rows = 0; + uint64_t row_offset = 0; + + auto calculate_validity_size = [](int const num_cols) { + // Now we need to add in space for validity + // Eventually we can think about nullable vs not nullable, but for now we will just always add + // it in + return (num_cols + 7) / 8; + }; - // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then calculate - // the size of each row's variable-width data as well. + // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then + // calculate the size of each row's variable-width data and validity as well. for (int row = 0; row < num_rows; ++row) { - row_sizes[row] = fixed_width_size_per_row + calculate_variable_width_row_data_size(row); - if (row_batch_size + row_sizes[row] > std::numeric_limits::max()) { + auto aligned_row_batch_size = + detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned + row_sizes[row] = fixed_width_size_per_row; + // validity is byte aligned + row_sizes[row] += calculate_validity_size(num_columns); + // variable width data is 8-byte aligned + row_sizes[row] = detail::align_offset(row_sizes[row], 8) + + calculate_variable_width_row_data_size(row); // rows are 8 byte aligned + + if (aligned_row_batch_size + row_sizes[row] > std::numeric_limits::max()) { // a new batch starts at the last 32-row boundary - row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); - row_batch_size = 0; - row_batch_rows = row_batch_rows & 31; - row_offset = 0; + row_batches.push_back( + row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batch_size = 0; + row_batch_rows = row_batch_rows & 31; + row_offset = 0; + aligned_row_batch_size = 0; } - row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned + row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned row_offsets.push_back(row_offset); - row_batch_size += row_sizes[row]; + row_batch_size = aligned_row_batch_size + row_sizes[row]; row_offset += row_sizes[row]; - total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned + total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned total_table_size += row_sizes[row]; row_batch_rows++; } if (row_batch_size > 0) { - row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows}); 
} - #if defined(DEBUG) +#if defined(DEBUG) + printf("%d rows and %d columns in table\n", num_rows, num_columns); printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { printf("%d: %d rows, ", i, row_batches[i].row_count); pretty_print(row_batches[i].num_bytes); printf("\n"); } - #endif +#endif std::vector block_infos; // block infos are organized with the windows going "down" the columns // this provides the most coalescing of memory access - int current_window_size = 0; + int current_window_width = 0; int current_window_start_col = 0; // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, num_rows](int const start_col, int const end_col, int const desired_window_height) { + auto build_blocks = [&block_infos, &row_batches, num_rows]( + int const start_col, int const end_col, int const desired_window_height) { int current_window_start_row = 0; int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; while (i < num_rows) { if (rows_left_in_batch == 0) { current_window_row_batch++; @@ -872,9 +965,10 @@ std::vector> convert_to_rows2(cudf::table_view con block_infos.emplace_back( detail::block_info{start_col, - current_window_start_row, - end_col, - std::min(current_window_start_row + window_height - 1, num_rows), current_window_row_batch}); + current_window_start_row, + end_col, + std::min(current_window_start_row + window_height - 1, num_rows - 1), + current_window_row_batch}); i += window_height; current_window_start_row += window_height; @@ -882,7 +976,17 @@ std::vector> convert_to_rows2(cudf::table_view con } }; - int const window_height = std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); + int const window_height = + std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); +#if defined(DEBUG) + printf( + "max_window_height is %d, num_rows is %d, batch row count is %d - which makes window height " + "%d\n", + max_window_height, + num_rows, + row_batches[0].row_count, + window_height); +#endif int row_size = 0; @@ -891,32 +995,74 @@ std::vector> convert_to_rows2(cudf::table_view con auto const col_size = column_sizes[col]; // align size for this type - std::size_t alignment_needed = col_size; // They are the same for fixed width types - auto row_size_with_this_col = detail::align_offset(row_size, alignment_needed) + col_size; + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_aligned = detail::align_offset(row_size, alignment_needed); + auto row_size_with_this_col = row_size_aligned + col_size; if (row_size_with_this_col * window_height > shmem_limit_per_block) { +#if defined(DEBUG) + printf( + "Window size %d too large at column %d, bumping back to build windows of size %d(cols " + "%d-%d), which is %d tall. 
Row size is too large at %d and ok at %d(aligned overall is %d) " + "for shared mem size %d\n", + row_size_with_this_col * window_height, + col, + row_size * window_height, + current_window_start_col, + col - 1, + window_height, + row_size_with_this_col, + row_size, + row_size_aligned, + shmem_limit_per_block); +#endif // too large, close this window, generate vertical blocks and restart build_blocks(current_window_start_col, col - 1, window_height); - row_size = detail::align_offset(column_starts[col] & 7, alignment_needed) + col_size; // alignment required for shared memory window boundary to match alignment of output row + row_size = + detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); +#if defined(DEBUG) + printf( + "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " + "or %d)\n", + row_size, + col_size, + row_size + col_size, + column_starts[col - 1], + column_sizes[col - 1], + column_starts[col - 1] + column_sizes[col - 1]); +#endif + row_size += col_size; // alignment required for shared memory window boundary to match + // alignment of output row current_window_start_col = col; + current_window_width = 0; } else { row_size = row_size_with_this_col; + current_window_width++; } } - auto validity_offset = detail::align_offset(column_starts.back(), 4); +#if defined(DEBUG) + printf("validity offset will be %d + %d = %d\n", + column_starts.back(), + column_sizes.back(), + column_starts.back() + column_sizes.back()); +#endif + auto validity_offset = detail::align_offset(column_starts.back() + column_sizes.back(), 4); column_starts.push_back(validity_offset); - + // build last set of blocks - if (current_window_size > 0) { build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); } + if (current_window_width > 0) { + build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); + } - // Get the pointers to the input columnar data ready - possibly moved up to copy to gpu while calculating other things + // Get the pointers to the input columnar data ready - possibly moved up to copy to gpu while + // calculating other things std::vector input_data; std::vector input_nm; for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); + column_view cv = tbl.column(column_number); auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; if (!nested_type) { input_data.emplace_back(cv.data()); @@ -924,81 +1070,87 @@ std::vector> convert_to_rows2(cudf::table_view con } } - #if defined(DEBUG) - printf("%lu windows for %d columns, %d rows to fit in ", block_infos.size(), block_infos[0].end_col - block_infos[0].start_col, block_infos[0].end_row - block_infos[0].start_row); +#if defined(DEBUG) + printf("%lu windows for %d columns, %d rows to fit in ", + block_infos.size(), + block_infos[0].end_col - block_infos[0].start_col + 1, + block_infos[0].end_row - block_infos[0].start_row); pretty_print(shmem_limit_per_block); printf(" shared mem("); pretty_print(fixed_width_size_per_row); printf("/row, %d columns, %d rows, ", num_columns, num_rows); pretty_print(total_table_size); printf(" total):\n"); - #endif +#endif auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); auto 
dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); - auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); std::vector output_buffers; std::vector output_data; output_data.reserve(row_batches.size()); - for (uint i=0; i(temp.data())); output_buffers.push_back(std::move(temp)); } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); // blast through the entire table and convert it dim3 blocks(block_infos.size()); - dim3 threads(1024); - copy_from_columns<<>>(num_rows, - num_columns, - dev_input_data.data(), - dev_input_nm.data(), - dev_col_sizes.data(), - dev_col_starts.data(), - dev_block_infos.data(), - dev_row_offsets.data(), - reinterpret_cast(dev_output_data.data())); + dim3 threads(std::min((uint64_t)1024, total_table_size / 8)); +#if defined(DEBUG) + printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); + pretty_print(shmem_limit_per_block); + printf(" shared memory\n"); +#endif + copy_from_columns<<>>( + num_rows, + num_columns, + dev_input_data.data(), + dev_input_nm.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + dev_block_infos.data(), + dev_row_offsets.data(), + reinterpret_cast(dev_output_data.data())); // split up the output buffer into multiple buffers based on row batch sizes // and create list of byte columns int offset_offset = 0; std::vector> ret; - for (uint i=0; i offset_vals; offset_vals.reserve(row_batches[i].row_count + 1); size_type cur_offset = 0; offset_vals.push_back(cur_offset); - for (int row=0; row(data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); + auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); + auto offsets = std::make_unique( + data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); - auto data = - std::make_unique(data_type{cudf::type_id::INT8}, - row_batches[i].num_bytes, - std::move(output_buffers[i])); + auto data = std::make_unique( + data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, std::move(output_buffers[i])); ret.push_back(cudf::make_lists_column(row_batches[i].row_count, - std::move(offsets), - std::move(data), - 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, - stream, - mr)); + std::move(offsets), + std::move(data), + 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, + stream, + mr)); } - + return ret; } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 088b0b747fb..2da28425c9e 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -330,6 +330,10 @@ ConfigureTest(RESHAPE_TEST reshape/interleave_columns_tests.cpp reshape/tile_tests.cpp) +################################################################################################### +# - row conversion test ---------------------------------------------------------------------------------- +ConfigureTest(ROW_CONVERSION_TEST row_conversion/row_conversion.cpp) + ################################################################################################### # - traits test ----------------------------------------------------------------------------------- 
ConfigureTest(TRAITS_TEST types/traits_test.cpp) diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp new file mode 100644 index 00000000000..c02f83ad1d5 --- /dev/null +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include + +struct ColumnToRowTests : public cudf::test::BaseFixture { +}; + +TEST_F(ColumnToRowTests, Single) +{ + cudf::test::fixed_width_column_wrapper a({-1}); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + auto new_rows = cudf::convert_to_rows2(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, Simple) +{ + cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + auto new_rows = cudf::convert_to_rows2(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, Tall) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + auto new_rows = cudf::convert_to_rows2(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, Wide) +{ + std::vector> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows(in); + auto new_rows = cudf::convert_to_rows2(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, SingleByteWide) +{ + std::vector> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows(in); + auto new_rows = cudf::convert_to_rows2(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} From 63a663697f9af3756e6b907d5a1595f3cdd8127a Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Mon, 21 Jun 2021 18:17:45 +0000 Subject: [PATCH 04/80] Updating windows to be generated in a square way so we can 
have more data to write out as 8-byte writes from shared memory. Shuffled some of the copy to GPU code up so it can start the copy sooner and hopefully won't force stalls. Some bug fixes. --- .../row_conversion/row_conversion.cpp | 15 ++- cpp/src/row_conversion/row_conversion.cu | 96 +++++++++++-------- 2 files changed, 67 insertions(+), 44 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index e1228c9df21..d6b195433cf 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -125,7 +125,7 @@ static void BM_new_to_row(benchmark::State& state) state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); }*/ -#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ +#define OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ BENCHMARK_DEFINE_F(RowConversion, name) \ (::benchmark::State & st) { f(st); } \ BENCHMARK_REGISTER_F(RowConversion, name) \ @@ -134,8 +134,17 @@ static void BM_new_to_row(benchmark::State& state) ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) -TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) +#define NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) +NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) #define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ BENCHMARK_DEFINE_F(RowConversion, name) \ diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 92ba075c316..3f221e2f716 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -364,7 +364,7 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. 
- bool debug_print = false; // blockIdx.x == 70 && threadIdx.x == 448; + constexpr bool debug_print = false; //blockIdx.x == 2649 && threadIdx.x == 479; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -383,6 +383,7 @@ __global__ void copy_from_columns(const size_type num_rows, }*/ printf("output data to %p\n", output_data[block_infos[blockIdx.x].buffer_num]); } + //else { return; } auto block = block_infos[blockIdx.x]; auto const rows_in_block = block.end_row - block.start_row + 1; extern __shared__ int8_t shared_data[]; @@ -416,7 +417,7 @@ __global__ void copy_from_columns(const size_type num_rows, col_sizes[block.end_col], block.start_col, col_offsets[block.start_col]); - printf("shmem row size %d\n", shmem_row_size); + printf("shmem row size %d with real bytes %d\n", shmem_row_size, real_bytes_in_row); printf("validity offset is %d\n", validity_offset); printf("starting at %d,%d and going to %d, %d\n", block.start_col, @@ -524,6 +525,8 @@ __global__ void copy_from_columns(const size_type num_rows, // auto const thread_start_offset = threadIdx.x * 8; auto const thread_stride = gridDim.x * 8; + auto const end_offset = shmem_row_size * rows_in_block; + if (debug_print) { printf("writing final data from %d to %d at stride %d\n", thread_start_offset, @@ -531,7 +534,7 @@ __global__ void copy_from_columns(const size_type num_rows, thread_stride); printf("rows in block %d\n", rows_in_block); } - for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * rows_in_block; + for (auto src_offset = thread_start_offset; src_offset < end_offset; src_offset += thread_stride) { auto const output_row_num = src_offset / shmem_row_size; auto const row_offset = row_offsets[block.start_row + output_row_num]; @@ -771,7 +774,6 @@ std::vector> convert_to_rows2(cudf::table_view con // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the // data, but small enough that multiple columns fit in memory so the writes can coalese as well. // Potential optimization for window sizes. - constexpr int max_window_height = 1024; const size_type num_columns = tbl.num_columns(); const size_type num_rows = tbl.num_rows(); @@ -816,6 +818,25 @@ std::vector> convert_to_rows2(cudf::table_view con // to that point. These are row batches and they are decided first before building the // windows so the windows can be properly cut around them. 
+ // Get the pointers to the input columnar data ready + std::vector input_data; + std::vector input_nm; + input_data.reserve(num_columns); + input_nm.reserve(num_columns); + for (size_type column_number = 0; column_number < num_columns; column_number++) { + column_view cv = tbl.column(column_number); + auto const col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (!nested_type) { + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + } + + auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + std::vector row_sizes; // size of each row in bytes including any alignment padding std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column @@ -847,6 +868,9 @@ std::vector> convert_to_rows2(cudf::table_view con fixed_width_size_per_row += col_size; } + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + // When building the columns to return, we have to be mindful of the offset limit in cudf. // It is 32-bit and these data columns are capable of surpassing that easily. The data should // not be cut off exactly at the limit though due to the validity buffers. The most efficient @@ -901,17 +925,18 @@ std::vector> convert_to_rows2(cudf::table_view con // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. + auto validity_size = calculate_validity_size(num_columns); for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned row_sizes[row] = fixed_width_size_per_row; // validity is byte aligned - row_sizes[row] += calculate_validity_size(num_columns); + row_sizes[row] += validity_size; // variable width data is 8-byte aligned row_sizes[row] = detail::align_offset(row_sizes[row], 8) + calculate_variable_width_row_data_size(row); // rows are 8 byte aligned - if (aligned_row_batch_size + row_sizes[row] > std::numeric_limits::max()) { + if ((uint64_t)aligned_row_batch_size + row_sizes[row] > (uint64_t)std::numeric_limits::max()) { // a new batch starts at the last 32-row boundary row_batches.push_back( row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); @@ -932,7 +957,9 @@ std::vector> convert_to_rows2(cudf::table_view con row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows}); } -#if defined(DEBUG) + auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + + #if defined(DEBUG) printf("%d rows and %d columns in table\n", num_rows, num_columns); printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { @@ -942,6 +969,16 @@ std::vector> convert_to_rows2(cudf::table_view con } #endif + std::vector output_buffers; + std::vector output_data; + output_data.reserve(row_batches.size()); + for (uint i = 0; i < row_batches.size(); ++i) { + rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); + output_data.push_back(static_cast(temp.data())); + output_buffers.push_back(std::move(temp)); + } + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + std::vector block_infos; // block infos are organized with the windows going "down" the 
columns @@ -976,8 +1013,13 @@ std::vector> convert_to_rows2(cudf::table_view con } }; + // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write would be memory cache line sized + // access, but since other blocks will read/write the edges this may not turn out to be overly important. + // For now, we will attempt to build a square window as far as byte sizes. x * y = shared_mem_size. + // Which translates to x^2 = shared_mem_size since we want them equal, so height and width are + // sqrt(shared_mem_size). The trick is that it's in bytes, not rows or columns. int const window_height = - std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); + std::min(std::min(size_type(sqrt(shmem_limit_per_block))/column_sizes[0], num_rows), row_batches[0].row_count); #if defined(DEBUG) printf( "max_window_height is %d, num_rows is %d, batch row count is %d - which makes window height " @@ -998,20 +1040,21 @@ std::vector> convert_to_rows2(cudf::table_view con std::size_t alignment_needed = col_size; // They are the same for fixed width types auto row_size_aligned = detail::align_offset(row_size, alignment_needed); auto row_size_with_this_col = row_size_aligned + col_size; + auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - if (row_size_with_this_col * window_height > shmem_limit_per_block) { + if (row_size_with_end_pad * window_height > shmem_limit_per_block) { #if defined(DEBUG) printf( "Window size %d too large at column %d, bumping back to build windows of size %d(cols " "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " "for shared mem size %d\n", - row_size_with_this_col * window_height, + row_size_with_end_pad * window_height, col, row_size * window_height, current_window_start_col, col - 1, window_height, - row_size_with_this_col, + row_size_with_end_pad, row_size, row_size_aligned, shmem_limit_per_block); @@ -1055,20 +1098,6 @@ std::vector> convert_to_rows2(cudf::table_view con build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); } - // Get the pointers to the input columnar data ready - possibly moved up to copy to gpu while - // calculating other things - std::vector input_data; - std::vector input_nm; - for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); - auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (!nested_type) { - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - } #if defined(DEBUG) printf("%lu windows for %d columns, %d rows to fit in ", @@ -1083,26 +1112,11 @@ std::vector> convert_to_rows2(cudf::table_view con printf(" total):\n"); #endif - auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); - auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); - - std::vector output_buffers; - std::vector output_data; - output_data.reserve(row_batches.size()); - for (uint i = 0; i < row_batches.size(); ++i) { - rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); - 
output_data.push_back(static_cast(temp.data())); - output_buffers.push_back(std::move(temp)); - } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); // blast through the entire table and convert it dim3 blocks(block_infos.size()); - dim3 threads(std::min((uint64_t)1024, total_table_size / 8)); + dim3 threads(std::min(1024, shmem_limit_per_block / 8)); #if defined(DEBUG) printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); pretty_print(shmem_limit_per_block); From b444279cbe7f41d858eb642e9c74c8bd8e9c8d69 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 8 Jul 2021 01:52:36 +0000 Subject: [PATCH 05/80] Adding row to column conversion code. Performance falls off a cliff, but starts out reasonably. I haven't looked at this in nsight yet. --- .../row_conversion/row_conversion.cpp | 74 +- cpp/include/cudf/row_conversion.hpp | 12 + cpp/src/row_conversion/row_conversion.cu | 759 +++++++++++++----- cpp/tests/row_conversion/row_conversion.cpp | 106 +++ 4 files changed, 748 insertions(+), 203 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index d6b195433cf..7c1f52c5cd6 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -91,7 +91,7 @@ static void BM_new_to_row(benchmark::State& state) state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); } -/*static void BM_from_row(benchmark::State& state) +static void BM_old_from_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const table = create_random_table({cudf::type_id::INT8, @@ -123,36 +123,62 @@ static void BM_new_to_row(benchmark::State& state) } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -}*/ - -#define OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); +} + +static void BM_new_from_row(benchmark::State& state) +{ + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + auto const table = create_random_table({cudf::type_id::INT8, + cudf::type_id::INT32, + cudf::type_id::INT16, + cudf::type_id::INT64, + cudf::type_id::INT32, + cudf::type_id::BOOL8, + cudf::type_id::UINT16, + cudf::type_id::UINT8, + cudf::type_id::UINT64}, + 256, + row_count{n_rows}); + + std::vector schema; + cudf::size_type total_bytes = 0; + for (int i = 0; i < table->num_columns(); ++i) { + auto t = table->get_column(i).type(); + schema.push_back(t); + total_bytes += cudf::size_of(t); + } + + auto rows = cudf::convert_to_rows(table->view()); + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + + auto out = cudf::convert_from_rows2(rows, schema); + } + + state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); +} -#define NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ +#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, 
name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) -NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) -#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ +#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { BM_from_row(st); } \ + (::benchmark::State & st) { f(st); } \ BENCHMARK_REGISTER_F(RowConversion, name) \ ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 22}}) \ + ->Ranges({{1 << 6, 1 << 20}}) \ ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -//FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) +FROM_ROW_CONVERSION_BENCHMARK_DEFINE(old_from_row_conversion, BM_old_from_row) +FROM_ROW_CONVERSION_BENCHMARK_DEFINE(new_from_row_conversion, BM_new_from_row) diff --git a/cpp/include/cudf/row_conversion.hpp b/cpp/include/cudf/row_conversion.hpp index f5e2225ad19..282ffa4b0cb 100644 --- a/cpp/include/cudf/row_conversion.hpp +++ b/cpp/include/cudf/row_conversion.hpp @@ -48,4 +48,16 @@ std::unique_ptr convert_from_rows( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); +std::unique_ptr convert_from_rows2( + cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr convert_from_rows2( + std::vector> const &input, + std::vector const &schema, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + } // namespace cudf diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 3f221e2f716..c0e78a03576 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -30,6 +30,7 @@ #include #include +#include #include "cudf/types.hpp" #include "rmm/device_buffer.hpp" #include "thrust/iterator/counting_iterator.h" @@ -332,6 +333,20 @@ struct block_info { int buffer_num; }; +// When building the columns to return, we have to be mindful of the offset limit in cudf. +// It is 32-bit and these data columns are capable of surpassing that easily. The data should +// not be cut off exactly at the limit though due to the validity buffers. The most efficient +// place to cut the validity is on a 32-row boundary, so as we calculate the row sizes +// we keep track of the cut points for the validity, which we call row batches. If the row +// is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we +// hit. Note that this boundary is for our book-keeping with column pointers and not anything that +// the kernel needs to worry about. We cut the output at convienient boundaries when assembling +// the outgoing data stream. 
+struct row_batch { + size_type num_bytes; + size_type row_count; +}; + /** * @brief copy data from cudf columns into x format, which is row-based * @@ -364,7 +379,7 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. - constexpr bool debug_print = false; //blockIdx.x == 2649 && threadIdx.x == 479; + bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -383,7 +398,7 @@ __global__ void copy_from_columns(const size_type num_rows, }*/ printf("output data to %p\n", output_data[block_infos[blockIdx.x].buffer_num]); } - //else { return; } + // else { return; } auto block = block_infos[blockIdx.x]; auto const rows_in_block = block.end_row - block.start_row + 1; extern __shared__ int8_t shared_data[]; @@ -403,7 +418,7 @@ __global__ void copy_from_columns(const size_type num_rows, block.buffer_num); } // each thread is responsible for every threadcount rows of data. - // the data is copies into shared memory in the final layout. + // the data is copied into shared memory in the final layout. auto const real_bytes_in_row = col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col]; auto const shmem_row_size = align_offset(real_bytes_in_row + dest_shim_offset, @@ -432,7 +447,7 @@ __global__ void copy_from_columns(const size_type num_rows, auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; if (debug_print) { printf("dest col offset %d\n", dest_col_offset); } - for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += gridDim.x) { + for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { if (debug_print) { printf("shmem row %d(%d) at offset %d(%d)\n", row - block.start_row, @@ -524,8 +539,8 @@ __global__ void copy_from_columns(const size_type num_rows, // row in shared memory may not be an entire row of the destination. 
// auto const thread_start_offset = threadIdx.x * 8; - auto const thread_stride = gridDim.x * 8; - auto const end_offset = shmem_row_size * rows_in_block; + auto const thread_stride = blockDim.x * 8; + auto const end_offset = shmem_row_size * rows_in_block; if (debug_print) { printf("writing final data from %d to %d at stride %d\n", @@ -559,9 +574,10 @@ __global__ void copy_from_columns(const size_type num_rows, auto const num_single_bytes = real_bytes_in_row - dest_shim_offset; for (auto i = 0; i < num_single_bytes; ++i) { if (debug_print) { - printf("case 3 - %d single byte final write %p -> %p\n", + printf("case 3 - %d single byte final write %p(%d) -> %p\n", num_single_bytes, &input_ptr[i + dest_shim_offset], + input_ptr[i + dest_shim_offset], &output_ptr[i]); } output_ptr[i] = input_ptr[i + dest_shim_offset]; @@ -600,6 +616,237 @@ __global__ void copy_from_columns(const size_type num_rows, } } +/** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param offsets + * @param output_data + * @param output_nm + * @param col_sizes array of sizes for each element in a column - one per column + * @param col_offsets offset into input data row for each column's start + * @param block_infos information about the blocks of work + * @param input_data pointer to input data + * + */ +__global__ void copy_to_columns(const size_type num_rows, + const size_type num_columns, + const size_type *offsets, + int8_t **output_data, + cudf::bitmask_type **output_nm, + const size_type *col_sizes, + const size_type *col_offsets, + const block_info *block_infos, + const int8_t *input_data) +{ + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // This has been broken up for us in the block_info struct, so we don't have + // any calculation to do here, but it is important to note. + + bool debug_print = false; //blockIdx.x == 1 && threadIdx.x == 0; + + if (debug_print) { + printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); + printf("Column Info:\n"); + for (int i = 0; i < num_columns; ++i) { + printf("col %d is at %p with size %d and offset %d\n", + i, + output_data[i], + col_sizes[i], + col_offsets[i]); + } + printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); + /* printf("Row Offsets:\n"); + for (int i=0; i(&input_data[offsets[absolute_row] + offset_in_row]); + if (debug_print) { + printf("which will be address %p\n", long_col_input); + printf("%p <- long %lu\n", shmem_dest, *long_col_input); } + *reinterpret_cast(shmem_dest) = *long_col_input; + } + + __syncthreads(); + + // now we copy from shared memory to final destination. + // the data is laid out in rows in shared memory, so the reads + // for a column will be "vertical". Because of this and the different + // sizes for each column, this portion is handled on row/column basis. + // to prevent each thread working on a single row and also to ensure + // that all threads can do work in the case of more threads than rows, + // we do a global index instead of a double for loop with col/row. 
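// Illustration of the flattened index with hypothetical sizes: if cols_in_block
// is 3, then index 0 -> (row 0, col 0), index 1 -> (row 0, col 1),
// index 2 -> (row 0, col 2), index 3 -> (row 1, col 0), and so on, since
// relative_col = index % cols_in_block and relative_row = index / cols_in_block.
// Consecutive threads therefore handle adjacent columns of the same shared-memory
// row until the column index wraps.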
+ for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { + auto const relative_col = index % cols_in_block; + auto const relative_row = index / cols_in_block; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + + auto const shared_memory_row_offset = window_quad_width * 8 * relative_row; + auto const shared_memory_offset = col_offsets[absolute_col] - col_offsets[block.start_col] + + shared_memory_row_offset + shared_memory_starting_pad; + auto const column_size = col_sizes[absolute_col]; + + int8_t *shmem_src = &shared_data[shared_memory_offset]; + int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; + + if (debug_print) { + printf("relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, shared_mmeory_row_offset: %d, shared_memory_offset: %d," + " column_size: %d, shmem_src: %p, dst: %p\n", relative_col, relative_row, absolute_col, absolute_row, shared_memory_row_offset, shared_memory_offset, column_size, + shmem_src, dst) ; + } + switch (column_size) { + case 1: { + if (debug_print) { printf("%p <- byte %d\n", dst, *shmem_src); } + *dst = *shmem_src; + break; + } + case 2: { + const int16_t *short_col_input = reinterpret_cast(shmem_src); + if (debug_print) { printf("%p <- short %d\n", dst, *short_col_input); } + *reinterpret_cast(dst) = *short_col_input; + break; + } + case 4: { + const int32_t *int_col_input = reinterpret_cast(shmem_src); + if (debug_print) { printf("%p <- int 0x%x\n", dst, *int_col_input); } + *reinterpret_cast(dst) = *int_col_input; + break; + } + case 8: { + const int64_t *long_col_input = reinterpret_cast(shmem_src); + if (debug_print) { printf("%p <- long %lu\n", dst, *long_col_input); } + *reinterpret_cast(dst) = *long_col_input; + break; + } + default: { + if (debug_print) { + printf("byte for byte copy due to size %d of column %d\n", column_size, absolute_col); + } + // TODO this should just not be supported for fixed width columns, but just in case... + for (cudf::size_type b = 0; b < column_size; b++) { dst[b] = shmem_src[b]; } + break; + } + } + } + + __syncthreads(); + + // now handle validity. Each thread is responsible for 32 rows in a single column. + // to prevent indexing issues with a large number of threads, this is compressed + // to a single loop like above. TODO: investigate using shared memory here + auto const validity_batches_per_col = (num_rows + 31) / 32; + auto const validity_batches_total = validity_batches_per_col * num_columns; + if (debug_print) { + printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n", validity_batches_per_col, validity_batches_total, num_rows); + } + for (int index = threadIdx.x; index < validity_batches_total; index += blockDim.x) { + // what column is this? 
+ auto const col = index / validity_batches_per_col; + auto const batch = index % validity_batches_per_col; + auto const starting_row = batch * 32; + auto const validity_offset = col_offsets[num_columns] + col / 8; + + if (debug_print) { + printf("col: %d, batch: %d, starting_row: %d, validity_offset: %d\n", col, batch, starting_row, validity_offset); + } + + int32_t dst_validity = 0; + for (int row = starting_row; row < std::min(num_rows, starting_row + 32); ++row) { + int8_t const * const validity_ptr = &input_data[offsets[row] + validity_offset]; + + if (debug_print) { + printf("validity_ptr is %p for row %d\nwhich is input_data[%d]\n", validity_ptr, row, offsets[row] + validity_offset); + } + + auto const val_byte = *validity_ptr; + auto const src_shift = col % 8; + auto const dst_shift = row % 32; + auto const src_bit_mask = 1 << src_shift; + if (debug_print) { + printf("src bit mask is 0x%x\n", src_bit_mask); + printf("src shift is 0x%x and dst shift is 0x%x\n", src_shift, dst_shift); + printf("validity bit is 0x%x\n", (val_byte & src_bit_mask) >> src_shift); + } +// auto const dst_bit_mask = 1 << dst_shift; + dst_validity |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); + if (debug_print) { + printf("validity is now 0x%x\n", dst_validity); + } + } + + + int32_t *validity_ptr = reinterpret_cast(output_nm[col] + (starting_row / 32)); + if (debug_print) { + printf("valiidty_ptr is output_nm[%d]: %p + starting_row / 8: %d because starting row is %d, which becomes %p\n", col, output_nm[col], starting_row / 32, starting_row, output_nm[col] + (starting_row / 32)); + printf("validity to write is %d\n", dst_validity); + printf("validity write %p <- %d\n", validity_ptr, dst_validity); + } + *validity_ptr = dst_validity; + } +} + /** * Calculate the dimensions of the kernel for fixed width only columns. * @param [in] num_columns the number of columns being copied. @@ -764,21 +1011,165 @@ static inline int32_t compute_fixed_width_layout(std::vector co return align_offset(at_offset, 8); // 8 bytes (64 bits) } -} // namespace detail +template +static size_type compute_column_information( + iterator begin, + iterator end, + std::vector &column_starts, + std::vector &column_sizes)//, + //std::function nested_type_cb) +{ + size_type fixed_width_size_per_row = 0; + for (auto cv = begin; cv != end; ++cv) { + auto col_type = std::get<0>(*cv); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + +// if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 
8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + } + + auto validity_offset = detail::align_offset(fixed_width_size_per_row, 4); + column_starts.push_back(validity_offset); + + return fixed_width_size_per_row; +} //#define DEBUG -std::vector> convert_to_rows2(cudf::table_view const &tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + +static std::vector build_block_infos(std::vector const &column_sizes, + std::vector const &column_starts, + std::vector const &row_batches, + size_type const total_number_of_rows, + size_type const &shmem_limit_per_block) { - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the - // data, but small enough that multiple columns fit in memory so the writes can coalese as well. - // Potential optimization for window sizes. - const size_type num_columns = tbl.num_columns(); - const size_type num_rows = tbl.num_rows(); + std::vector block_infos; + + // block infos are organized with the windows going "down" the columns + // this provides the most coalescing of memory access + int current_window_width = 0; + int current_window_start_col = 0; + + // build the blocks for a specific set of columns + auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( + int const start_col, int const end_col, int const desired_window_height) { + int current_window_start_row = 0; + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; + while (i < total_number_of_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(desired_window_height, rows_left_in_batch); + + block_infos.emplace_back(detail::block_info{ + start_col, + current_window_start_row, + end_col, + std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), + current_window_row_batch}); + + i += window_height; + current_window_start_row += window_height; + rows_left_in_batch -= window_height; + } + }; + + // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write + // would be memory cache line sized access, but since other blocks will read/write the edges this + // may not turn out to be overly important. For now, we will attempt to build a square window as + // far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we + // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in + // bytes, not rows or columns. 
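// Worked example, assuming a 48 KiB shared-memory budget and an 8-byte first
// column: size_type(sqrt(49152)) is 221 bytes per side, giving a starting window
// height of 221 / 8 = 27 rows, which the std::min below further clamps to the
// table's row count and to the first row batch's row count.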
+ int const window_height = std::min( + std::min(size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], total_number_of_rows), + row_batches[0].row_count); +#if defined(DEBUG) + printf( + "sqrt(shmem_limit_per_block) / column_sizes[0] is %d and num_rows is %d, batch row count is %d - which makes window height " + "%d\n", + size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], + total_number_of_rows, + row_batches[0].row_count, + window_height); +#endif + + int row_size = 0; + + // march each column and build the blocks of appropriate sizes + for (unsigned int col = 0; col < column_sizes.size(); ++col) { + auto const col_size = column_sizes[col]; + + // align size for this type + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_aligned = detail::align_offset(row_size, alignment_needed); + auto row_size_with_this_col = row_size_aligned + col_size; + auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); + + if (row_size_with_end_pad * window_height > shmem_limit_per_block) { +#if defined(DEBUG) + printf( + "Window size %d too large at column %d, bumping back to build windows of size %d(cols " + "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " + "for shared mem size %d\n", + row_size_with_end_pad * window_height, + col, + row_size * window_height, + current_window_start_col, + col - 1, + window_height, + row_size_with_end_pad, + row_size, + row_size_aligned, + shmem_limit_per_block); +#endif + // too large, close this window, generate vertical blocks and restart + build_blocks(current_window_start_col, col - 1, window_height); + row_size = + detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); +#if defined(DEBUG) + printf( + "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " + "or %d)\n", + row_size, + col_size, + row_size + col_size, + column_starts[col - 1], + column_sizes[col - 1], + column_starts[col - 1] + column_sizes[col - 1]); +#endif + row_size += col_size; // alignment required for shared memory window boundary to match + // alignment of output row + current_window_start_col = col; + current_window_width = 0; + } else { + row_size = row_size_with_this_col; + current_window_width++; + } + } + + // build last set of blocks + if (current_window_width > 0) { + build_blocks(current_window_start_col, (int)column_sizes.size()-1, window_height); + } + + return block_infos; +} +} // namespace detail #if defined(DEBUG) - auto pretty_print = [](uint64_t i) { + void pretty_print(uint64_t i) { if (i > (1 * 1024 * 1024 * 1024)) { printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); } else if (i > (1 * 1024 * 1024)) { @@ -788,9 +1179,19 @@ std::vector> convert_to_rows2(cudf::table_view con } else { printf("%lu Bytes", i); } - }; + } #endif +std::vector> convert_to_rows2(cudf::table_view const &tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the + // data, but small enough that multiple columns fit in memory so the writes can coalese as well. + // Potential optimization for window sizes. 
+ const size_type num_columns = tbl.num_columns(); + const size_type num_rows = tbl.num_rows(); + int device_id; CUDA_TRY(cudaGetDevice(&device_id)); int shmem_limit_per_block; @@ -834,8 +1235,8 @@ std::vector> convert_to_rows2(cudf::table_view con } } - auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); std::vector row_sizes; // size of each row in bytes including any alignment padding std::vector row_offsets; // offset from the start of the data to this row @@ -848,43 +1249,48 @@ std::vector> convert_to_rows2(cudf::table_view con column_sizes.reserve(num_columns); column_starts.reserve(num_columns + 1); // we add a final offset for validity data start - size_type fixed_width_size_per_row = 0; - for (int col = 0; col < num_columns; ++col) { - auto cv = tbl.column(col); - auto col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&tbl](auto i) -> std::tuple { + return std::make_tuple(tbl.column(i).type(), tbl.column(i)); + }); + + size_type fixed_width_size_per_row = detail::compute_column_information( + iter, + iter + num_columns, + column_starts, + column_sizes);//, +// [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); + /* size_type fixed_width_size_per_row = 0; + for (int col = 0; col < num_columns; ++col) { + auto cv = tbl.column(col); + auto col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (nested_type) { variable_width_columns.push_back(cv); } + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + }*/ - if (nested_type) { variable_width_columns.push_back(cv); } +#if defined(DEBUG) + printf("validity offset will be %d + %d = %d\n", + column_starts.back(), + column_sizes.back(), + column_starts.back() + column_sizes.back()); +#endif - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 8 : size_of(col_type); - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - } + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); - - // When building the columns to return, we have to be mindful of the offset limit in cudf. 
- // It is 32-bit and these data columns are capable of surpassing that easily. The data should - // not be cut off exactly at the limit though due to the validity buffers. The most efficient - // place to cut the validity is on a 32-row boundary, so as we calculate the row sizes - // we keep track of the cut points for the validity, which we call row batches. If the row - // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we - // hit. Note that this boundary is for our book-keeping with column pointers and not anything that - // the kernel needs to worry about. We cut the output at convienient boundaries when assembling - // the outgoing data stream. - struct row_batch { - size_type num_bytes; - size_type row_count; - }; - std::vector row_batches; + std::vector row_batches; auto calculate_variable_width_row_data_size = [](int const row) { // each level of variable-width data will add an offset/length @@ -936,10 +1342,11 @@ std::vector> convert_to_rows2(cudf::table_view con row_sizes[row] = detail::align_offset(row_sizes[row], 8) + calculate_variable_width_row_data_size(row); // rows are 8 byte aligned - if ((uint64_t)aligned_row_batch_size + row_sizes[row] > (uint64_t)std::numeric_limits::max()) { + if ((uint64_t)aligned_row_batch_size + row_sizes[row] > + (uint64_t)std::numeric_limits::max()) { // a new batch starts at the last 32-row boundary row_batches.push_back( - row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); row_batch_size = 0; row_batch_rows = row_batch_rows & 31; row_offset = 0; @@ -954,12 +1361,12 @@ std::vector> convert_to_rows2(cudf::table_view con row_batch_rows++; } if (row_batch_size > 0) { - row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows}); + row_batches.push_back(detail::row_batch{static_cast(row_batch_size), row_batch_rows}); } auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); - #if defined(DEBUG) +#if defined(DEBUG) printf("%d rows and %d columns in table\n", num_rows, num_columns); printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { @@ -979,125 +1386,8 @@ std::vector> convert_to_rows2(cudf::table_view con } auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); - std::vector block_infos; - - // block infos are organized with the windows going "down" the columns - // this provides the most coalescing of memory access - int current_window_width = 0; - int current_window_start_col = 0; - - // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, num_rows]( - int const start_col, int const end_col, int const desired_window_height) { - int current_window_start_row = 0; - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; - while (i < num_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(desired_window_height, rows_left_in_batch); - - block_infos.emplace_back( - detail::block_info{start_col, - current_window_start_row, - end_col, - std::min(current_window_start_row + window_height - 1, num_rows - 1), - current_window_row_batch}); - - i += window_height; - current_window_start_row += window_height; - rows_left_in_batch -= window_height; - } - }; - - // the ideal window height has 
lots of 8-byte reads and 8-byte writes. The optimal read/write would be memory cache line sized - // access, but since other blocks will read/write the edges this may not turn out to be overly important. - // For now, we will attempt to build a square window as far as byte sizes. x * y = shared_mem_size. - // Which translates to x^2 = shared_mem_size since we want them equal, so height and width are - // sqrt(shared_mem_size). The trick is that it's in bytes, not rows or columns. - int const window_height = - std::min(std::min(size_type(sqrt(shmem_limit_per_block))/column_sizes[0], num_rows), row_batches[0].row_count); -#if defined(DEBUG) - printf( - "max_window_height is %d, num_rows is %d, batch row count is %d - which makes window height " - "%d\n", - max_window_height, - num_rows, - row_batches[0].row_count, - window_height); -#endif - - int row_size = 0; - - // march each column and build the blocks of appropriate sizes - for (int col = 0; col < num_columns; ++col) { - auto const col_size = column_sizes[col]; - - // align size for this type - std::size_t alignment_needed = col_size; // They are the same for fixed width types - auto row_size_aligned = detail::align_offset(row_size, alignment_needed); - auto row_size_with_this_col = row_size_aligned + col_size; - auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - - if (row_size_with_end_pad * window_height > shmem_limit_per_block) { -#if defined(DEBUG) - printf( - "Window size %d too large at column %d, bumping back to build windows of size %d(cols " - "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " - "for shared mem size %d\n", - row_size_with_end_pad * window_height, - col, - row_size * window_height, - current_window_start_col, - col - 1, - window_height, - row_size_with_end_pad, - row_size, - row_size_aligned, - shmem_limit_per_block); -#endif - // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col - 1, window_height); - row_size = - detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); -#if defined(DEBUG) - printf( - "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " - "or %d)\n", - row_size, - col_size, - row_size + col_size, - column_starts[col - 1], - column_sizes[col - 1], - column_starts[col - 1] + column_sizes[col - 1]); -#endif - row_size += col_size; // alignment required for shared memory window boundary to match - // alignment of output row - current_window_start_col = col; - current_window_width = 0; - } else { - row_size = row_size_with_this_col; - current_window_width++; - } - } - -#if defined(DEBUG) - printf("validity offset will be %d + %d = %d\n", - column_starts.back(), - column_sizes.back(), - column_starts.back() + column_sizes.back()); -#endif - auto validity_offset = detail::align_offset(column_starts.back() + column_sizes.back(), 4); - column_starts.push_back(validity_offset); - - // build last set of blocks - if (current_window_width > 0) { - build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); - } - + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); #if defined(DEBUG) printf("%lu windows for %d columns, %d rows to fit in ", @@ -1116,7 +1406,11 @@ std::vector> convert_to_rows2(cudf::table_view con // blast through the entire table and convert it dim3 blocks(block_infos.size()); - dim3 
threads(std::min(1024, shmem_limit_per_block / 8)); + #if defined(DEBUG) || 1 + dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)total_table_size)); + #else + dim3 threads(std::min(std::min(1024, shmem_limit_per_block / 8), (int)total_table_size)); + #endif #if defined(DEBUG) printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); pretty_print(shmem_limit_per_block); @@ -1206,11 +1500,11 @@ std::vector> convert_to_rows(cudf::table_view cons using ScalarType = cudf::scalar_type_t; auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - zero->set_valid(true, stream); + zero->set_valid_async(true, stream); static_cast(zero.get())->set_value(0, stream); auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - step->set_valid(true, stream); + step->set_valid_async(true, stream); static_cast(step.get()) ->set_value(static_cast(size_per_row), stream); @@ -1238,6 +1532,97 @@ std::vector> convert_to_rows(cudf::table_view cons } } +std::unique_ptr convert_from_rows2(cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + // verify that the types are what we expect + cudf::column_view child = input.child(); + cudf::type_id list_type = child.type().id(); + CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, + "Only a list of bytes is supported as input"); + + cudf::size_type num_columns = schema.size(); + cudf::size_type num_rows = input.parent().size(); + + int device_id; + CUDA_TRY(cudaGetDevice(&device_id)); + int shmem_limit_per_block; + CUDA_TRY( + cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + + std::vector column_starts; + std::vector column_sizes; + + auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { + return std::make_tuple(schema[i], nullptr); + }); + size_type fixed_width_size_per_row = detail::compute_column_information( + iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); + + size_type validity_size = (num_columns + 7) / 8; + + size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); + + // Ideally we would check that the offsets are all the same, etc. 
but for now + // this is probably fine + CUDF_EXPECTS(row_size * num_rows == child.size(), + "The layout of the data appears to be off"); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + + // build the row_batches from the passed in list column + std::vector row_batches; + + row_batches.push_back(detail::row_batch{child.size(), num_rows}); + + // Allocate the columns we are going to write into + std::vector> output_columns; + std::vector output_data; + std::vector output_nm; + for (cudf::size_type i = 0; i < num_columns; i++) { + auto column = cudf::make_fixed_width_column( + schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); + auto mut = column->mutable_view(); + output_data.emplace_back(mut.data()); + output_nm.emplace_back(mut.null_mask()); + output_columns.emplace_back(std::move(column)); + } + + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + auto dev_output_nm = detail::copy_to_dev_async2(output_nm, stream, mr); + + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); + + auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + + dim3 blocks(block_infos.size()); + #if defined(DEBUG) || 1 + dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)child.size())); + #else + dim3 threads(std::min(std::min(1024, shmem_limit_per_block / 8), (int)child.size())); + #endif +#if defined(DEBUG) + printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); + pretty_print(shmem_limit_per_block); + printf(" shared memory\n"); +#endif + detail::copy_to_columns<<>>( + num_rows, + num_columns, + input.offsets().data(), + dev_output_data.data(), + dev_output_nm.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + dev_block_infos.data(), + child.data()); + + return std::make_unique(std::move(output_columns)); +} + std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, std::vector const &schema, rmm::cuda_stream_view stream, @@ -1318,4 +1703,20 @@ std::unique_ptr convert_from_rows( // } } +std::unique_ptr convert_from_rows2( + std::vector> const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + CUDF_EXPECTS(input.size() == 1, "Too large of an input, need to concat the output tables..."); + + // for (uint i=0; iview(); + auto ret = convert_from_rows2(lcv, schema, stream, mr); + + return ret; + // } +} + } // namespace cudf diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index c02f83ad1d5..818d7a89ddb 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -21,9 +21,13 @@ #include #include +#include "cudf/lists/lists_column_view.hpp" +#include "cudf/types.hpp" struct ColumnToRowTests : public cudf::test::BaseFixture { }; +struct RowToColumnTests : public cudf::test::BaseFixture { +}; TEST_F(ColumnToRowTests, Single) { @@ -108,3 +112,105 @@ TEST_F(ColumnToRowTests, SingleByteWide) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } } + +TEST_F(RowToColumnTests, Single) +{ + cudf::test::fixed_width_column_wrapper a({-1}); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema{cudf::data_type{cudf::type_id::INT32}}; + for (uint i=0; i a({-1, 0, 1}); + cudf::table_view 
in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema{cudf::data_type{cudf::type_id::INT32}}; + for (uint i=0; i int32_t { return rand(); }); + cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema; + schema.reserve(in.num_columns()); + for (auto col = in.begin(); col < in.end(); ++col) { + schema.push_back(col->type()); + } + for (uint i=0; i> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema; + schema.reserve(in.num_columns()); + for (auto col = in.begin(); col < in.end(); ++col) { + schema.push_back(col->type()); + } + + for (uint i=0; i> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema; + schema.reserve(in.num_columns()); + for (auto col = in.begin(); col < in.end(); ++col) { + schema.push_back(col->type()); + } + for (uint i=0; i Date: Thu, 8 Jul 2021 20:45:18 +0000 Subject: [PATCH 06/80] updating to use make_device_uvector_async and bitmask functions per review comments --- cpp/src/row_conversion/row_conversion.cu | 125 +++++++++-------------- 1 file changed, 47 insertions(+), 78 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index c0e78a03576..c73e967cf0f 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -36,6 +37,7 @@ #include "thrust/iterator/counting_iterator.h" #include "thrust/iterator/transform_iterator.h" +using cudf::detail::make_device_uvector_async; namespace cudf { namespace detail { @@ -45,32 +47,6 @@ static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size return (offset + alignment - 1) & ~(alignment - 1); } -/** - * Copy a simple vector to device memory asynchronously. Be sure to read - * the data on the same stream as is used to copy it. 
- */ -template -std::unique_ptr> copy_to_dev_async(const std::vector &input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - std::unique_ptr> ret(new rmm::device_uvector(input.size(), stream, mr)); - CUDA_TRY(cudaMemcpyAsync( - ret->data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); - return ret; -} - -template -rmm::device_uvector copy_to_dev_async2(const std::vector &input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - rmm::device_uvector ret(input.size(), stream, mr); - CUDA_TRY(cudaMemcpyAsync( - ret.data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); - return ret; -} - __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type row_size, @@ -180,8 +156,8 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, } cudf::bitmask_type *nm = output_nm[col_index]; - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; + int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; + cudf::size_type byte_bit_offset = intra_word_index(col_index); int predicate = *valid_byte & (1 << byte_bit_offset); uint32_t bitmask = __ballot_sync(active_mask, predicate); if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } @@ -278,8 +254,8 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, } // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; + int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; + cudf::size_type byte_bit_offset = intra_word_index(col_index); uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -505,8 +481,8 @@ __global__ void copy_from_columns(const size_type num_rows, // we do this directly in the final location because the entire row may not // fit in shared memory and may require many blocks to process it entirely int8_t *valid_byte = - &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; - cudf::size_type byte_bit_offset = col % 8; + &output_data[block.buffer_num][row_offsets[row] + validity_offset + word_index(col)]; + cudf::size_type byte_bit_offset = intra_word_index(col); uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -648,7 +624,7 @@ __global__ void copy_to_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. 
- bool debug_print = false; //blockIdx.x == 1 && threadIdx.x == 0; + bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -806,7 +782,7 @@ __global__ void copy_to_columns(const size_type num_rows, auto const col = index / validity_batches_per_col; auto const batch = index % validity_batches_per_col; auto const starting_row = batch * 32; - auto const validity_offset = col_offsets[num_columns] + col / 8; + auto const validity_offset = col_offsets[num_columns] + word_index(col); if (debug_print) { printf("col: %d, batch: %d, starting_row: %d, validity_offset: %d\n", col, batch, starting_row, validity_offset); @@ -821,7 +797,7 @@ __global__ void copy_to_columns(const size_type num_rows, } auto const val_byte = *validity_ptr; - auto const src_shift = col % 8; + auto const src_shift = intra_word_index(col); auto const dst_shift = row % 32; auto const src_bit_mask = 1 << src_shift; if (debug_print) { @@ -920,10 +896,10 @@ static std::unique_ptr fixed_width_convert_to_rows( const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type size_per_row, - std::unique_ptr> &column_start, - std::unique_ptr> &column_size, - std::unique_ptr> &input_data, - std::unique_ptr> &input_nm, + rmm::device_uvector &column_start, + rmm::device_uvector &column_size, + rmm::device_uvector &input_data, + rmm::device_uvector &input_nm, const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, rmm::cuda_stream_view stream, @@ -954,10 +930,10 @@ static std::unique_ptr fixed_width_convert_to_rows( num_rows, num_columns, size_per_row, - column_start->data(), - column_size->data(), - input_data->data(), - input_nm->data(), + column_start.data(), + column_size.data(), + input_data.data(), + input_nm.data(), data->mutable_view().data()); return cudf::make_lists_column(num_rows, @@ -1004,7 +980,7 @@ static inline int32_t compute_fixed_width_layout(std::vector co // Now we need to add in space for validity // Eventually we can think about nullable vs not nullable, but for now we will just always add it // in - int32_t validity_bytes_needed = (schema.size() + 7) / 8; + int32_t validity_bytes_needed = word_index(schema.size() + 7); // validity comes at the end and is byte aligned so we can pack more in. 
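// Aside on the word_index()/intra_word_index() substitutions in this commit:
// those helpers operate on 32-bit bitmask words (bit / 32 and bit % 32), while
// the row format stores validity byte-aligned, so the byte arithmetic here is
// (n + 7) / 8 and n % 8. The next commit in this series reverts these calls for
// exactly that reason.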
at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned @@ -1235,8 +1211,8 @@ std::vector> convert_to_rows2(cudf::table_view con } } - auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + auto dev_input_data = make_device_uvector_async(input_data, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); std::vector row_sizes; // size of each row in bytes including any alignment padding std::vector row_offsets; // offset from the start of the data to this row @@ -1287,8 +1263,8 @@ std::vector> convert_to_rows2(cudf::table_view con #endif - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); std::vector row_batches; @@ -1322,16 +1298,9 @@ std::vector> convert_to_rows2(cudf::table_view con size_type row_batch_rows = 0; uint64_t row_offset = 0; - auto calculate_validity_size = [](int const num_cols) { - // Now we need to add in space for validity - // Eventually we can think about nullable vs not nullable, but for now we will just always add - // it in - return (num_cols + 7) / 8; - }; - // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. - auto validity_size = calculate_validity_size(num_columns); + auto validity_size = num_bitmask_words(num_columns); for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned @@ -1364,7 +1333,7 @@ std::vector> convert_to_rows2(cudf::table_view con row_batches.push_back(detail::row_batch{static_cast(row_batch_size), row_batch_rows}); } - auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); #if defined(DEBUG) printf("%d rows and %d columns in table\n", num_rows, num_columns); @@ -1384,7 +1353,7 @@ std::vector> convert_to_rows2(cudf::table_view con output_data.push_back(static_cast(temp.data())); output_buffers.push_back(std::move(temp)); } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); std::vector block_infos = build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); @@ -1402,7 +1371,7 @@ std::vector> convert_to_rows2(cudf::table_view con printf(" total):\n"); #endif - auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); // blast through the entire table and convert it dim3 blocks(block_infos.size()); @@ -1443,7 +1412,7 @@ std::vector> convert_to_rows2(cudf::table_view con } offset_offset += row_batches[i].row_count; - auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); + auto dev_offsets = make_device_uvector_async(offset_vals, stream, mr); auto offsets = std::make_unique( data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); @@ -1477,8 +1446,8 @@ std::vector> convert_to_rows(cudf::table_view cons std::vector column_size; int32_t size_per_row = 
detail::compute_fixed_width_layout(schema, column_start, column_size); - auto dev_column_start = detail::copy_to_dev_async(column_start, stream, mr); - auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); + auto dev_column_start = make_device_uvector_async(column_start, stream, mr); + auto dev_column_size = make_device_uvector_async(column_size, stream, mr); int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; // Make the number of rows per batch a multiple of 32 so we don't have to worry about @@ -1495,8 +1464,8 @@ std::vector> convert_to_rows(cudf::table_view cons input_data.emplace_back(cv.data()); input_nm.emplace_back(cv.null_mask()); } - auto dev_input_data = detail::copy_to_dev_async(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async(input_nm, stream, mr); + auto dev_input_data = make_device_uvector_async(input_data, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); using ScalarType = cudf::scalar_type_t; auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); @@ -1561,7 +1530,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i size_type fixed_width_size_per_row = detail::compute_column_information( iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); - size_type validity_size = (num_columns + 7) / 8; + size_type validity_size = num_bitmask_words(num_columns); size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); @@ -1569,8 +1538,8 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i // this is probably fine CUDF_EXPECTS(row_size * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); // build the row_batches from the passed in list column std::vector row_batches; @@ -1590,13 +1559,13 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i output_columns.emplace_back(std::move(column)); } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); - auto dev_output_nm = detail::copy_to_dev_async2(output_nm, stream, mr); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); std::vector block_infos = build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); dim3 blocks(block_infos.size()); #if defined(DEBUG) || 1 @@ -1647,8 +1616,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in // this is probably fine CUDF_EXPECTS(size_per_row * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_column_start = detail::copy_to_dev_async(column_start, stream, mr); - auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); + auto dev_column_start = make_device_uvector_async(column_start, stream); + auto dev_column_size = make_device_uvector_async(column_size, stream); // Allocate the columns we are going to write into std::vector> output_columns; @@ -1663,8 +1632,8 @@ 
std::unique_ptr convert_from_rows(cudf::lists_column_view const &in output_columns.emplace_back(std::move(column)); } - auto dev_output_data = detail::copy_to_dev_async(output_data, stream, mr); - auto dev_output_nm = detail::copy_to_dev_async(output_nm, stream, mr); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); dim3 blocks; dim3 threads; @@ -1675,10 +1644,10 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in num_rows, num_columns, size_per_row, - dev_column_start->data(), - dev_column_size->data(), - dev_output_data->data(), - dev_output_nm->data(), + dev_column_start.data(), + dev_column_size.data(), + dev_output_data.data(), + dev_output_nm.data(), child.data()); return std::make_unique(std::move(output_columns)); From f8bc01fa175a44fed79645f4c39c6e0944acfb6e Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 13 Jul 2021 07:18:49 +0000 Subject: [PATCH 07/80] updating conversion code. Found out bit operations are on 32-bit values, so they can't be used since row data has byte-aligned validity. Performance improvements on the row to column side. --- cpp/src/row_conversion/row_conversion.cu | 106 ++++++++++++----------- 1 file changed, 54 insertions(+), 52 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index c73e967cf0f..0879a1c50a5 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -37,6 +37,8 @@ #include "thrust/iterator/counting_iterator.h" #include "thrust/iterator/transform_iterator.h" +#define NUM_BLOCKS_PER_KERNEL_TO_COLUMNS (2) + using cudf::detail::make_device_uvector_async; namespace cudf { @@ -156,11 +158,11 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, } cudf::bitmask_type *nm = output_nm[col_index]; - int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; - cudf::size_type byte_bit_offset = intra_word_index(col_index); + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; int predicate = *valid_byte & (1 << byte_bit_offset); uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } + if (row_index % 32 == 0) { nm[row_index / 8] = bitmask; } } // end column loop } // end row copy // wait for the row_group to be totally copied before starting on the next row group @@ -254,8 +256,8 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, } // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; - cudf::size_type byte_bit_offset = intra_word_index(col_index); + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -481,8 +483,8 @@ __global__ void copy_from_columns(const size_type num_rows, // we do this directly in the final location because the entire row may not // fit in shared memory and may require many blocks to process it entirely int8_t *valid_byte = - &output_data[block.buffer_num][row_offsets[row] + validity_offset + word_index(col)]; - cudf::size_type 
byte_bit_offset = intra_word_index(col); + &output_data[block.buffer_num][row_offsets[row] + validity_offset + (col / 8)]; + cudf::size_type byte_bit_offset = col % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -597,6 +599,7 @@ __global__ void copy_from_columns(const size_type num_rows, * * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block * @param offsets * @param output_data * @param output_nm @@ -608,6 +611,7 @@ __global__ void copy_from_columns(const size_type num_rows, */ __global__ void copy_to_columns(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, const size_type *offsets, int8_t **output_data, cudf::bitmask_type **output_nm, @@ -624,18 +628,10 @@ __global__ void copy_to_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. - bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; + constexpr bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("Column Info:\n"); - for (int i = 0; i < num_columns; ++i) { - printf("col %d is at %p with size %d and offset %d\n", - i, - output_data[i], - col_sizes[i], - col_offsets[i]); - } printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); /* printf("Row Offsets:\n"); for (int i=0; i blockDim.x) { + break; + } + auto block = block_infos[this_block_index]; auto const rows_in_block = block.end_row - block.start_row + 1; auto const cols_in_block = block.end_col - block.start_col + 1; extern __shared__ int8_t shared_data[]; @@ -767,61 +769,58 @@ __global__ void copy_to_columns(const size_type num_rows, } } - __syncthreads(); - - // now handle validity. Each thread is responsible for 32 rows in a single column. + // now handle validity. Each thread is responsible for 32 rows in 8 columns. // to prevent indexing issues with a large number of threads, this is compressed // to a single loop like above. TODO: investigate using shared memory here auto const validity_batches_per_col = (num_rows + 31) / 32; - auto const validity_batches_total = validity_batches_per_col * num_columns; - if (debug_print) { - printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n", validity_batches_per_col, validity_batches_total, num_rows); + auto const validity_batches_total = std::max(1, validity_batches_per_col * (num_columns / 8)); + if (debug_print && threadIdx.x == 0 && blockIdx.x == 0) { + printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n%d blocks of %d threads\n", validity_batches_per_col, validity_batches_total, num_rows, gridDim.x, blockDim.x); } - for (int index = threadIdx.x; index < validity_batches_total; index += blockDim.x) { - // what column is this? 
- auto const col = index / validity_batches_per_col; + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < validity_batches_total; index += blockDim.x * gridDim.x) { + auto const start_col = (index * 8) / validity_batches_per_col; auto const batch = index % validity_batches_per_col; auto const starting_row = batch * 32; - auto const validity_offset = col_offsets[num_columns] + word_index(col); + auto const validity_offset = col_offsets[num_columns] + (start_col / 8); if (debug_print) { - printf("col: %d, batch: %d, starting_row: %d, validity_offset: %d\n", col, batch, starting_row, validity_offset); + printf("%d-%d: cols: %d-%d, word index: %d, batch: %d, starting_row: %d, +validity_offset: %d, index: %d, stride: %d\n", threadIdx.x, blockIdx.x, start_col, start_col + 7, (start_col / 8), batch, starting_row, validity_offset, index, blockDim.x * gridDim.x); } - int32_t dst_validity = 0; + // one for each column + int32_t dst_validity[8] = {0}; for (int row = starting_row; row < std::min(num_rows, starting_row + 32); ++row) { int8_t const * const validity_ptr = &input_data[offsets[row] + validity_offset]; if (debug_print) { - printf("validity_ptr is %p for row %d\nwhich is input_data[%d]\n", validity_ptr, row, offsets[row] + validity_offset); + printf("%d: validity_ptr is %p for row %d\n", threadIdx.x, validity_ptr, row); } auto const val_byte = *validity_ptr; - auto const src_shift = intra_word_index(col); - auto const dst_shift = row % 32; - auto const src_bit_mask = 1 << src_shift; - if (debug_print) { - printf("src bit mask is 0x%x\n", src_bit_mask); - printf("src shift is 0x%x and dst shift is 0x%x\n", src_shift, dst_shift); - printf("validity bit is 0x%x\n", (val_byte & src_bit_mask) >> src_shift); - } -// auto const dst_bit_mask = 1 << dst_shift; - dst_validity |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); - if (debug_print) { - printf("validity is now 0x%x\n", dst_validity); + + for (int i=0; i> src_shift); + } + // auto const dst_bit_mask = 1 << dst_shift; + dst_validity[i] |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); } } - int32_t *validity_ptr = reinterpret_cast(output_nm[col] + (starting_row / 32)); - if (debug_print) { - printf("valiidty_ptr is output_nm[%d]: %p + starting_row / 8: %d because starting row is %d, which becomes %p\n", col, output_nm[col], starting_row / 32, starting_row, output_nm[col] + (starting_row / 32)); - printf("validity to write is %d\n", dst_validity); - printf("validity write %p <- %d\n", validity_ptr, dst_validity); + for (int i=0; i(output_nm[start_col + i] + (starting_row / 32)); + if (debug_print) { + printf("%d-%d: validity write output_nm[%d][%d] - %p <- %d\n", threadIdx.x, blockIdx.x, start_col + i, starting_row, validity_ptr, dst_validity[i]); + } + *validity_ptr = dst_validity[i]; } - *validity_ptr = dst_validity; } } +} /** * Calculate the dimensions of the kernel for fixed width only columns. @@ -980,7 +979,7 @@ static inline int32_t compute_fixed_width_layout(std::vector co // Now we need to add in space for validity // Eventually we can think about nullable vs not nullable, but for now we will just always add it // in - int32_t validity_bytes_needed = word_index(schema.size() + 7); + int32_t validity_bytes_needed = (schema.size() + 7) / 8; // validity comes at the end and is byte aligned so we can pack more in. 
at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned @@ -1300,7 +1299,7 @@ std::vector> convert_to_rows2(cudf::table_view con // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. - auto validity_size = num_bitmask_words(num_columns); + auto validity_size = num_bitmask_words(num_columns) * 4; for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned @@ -1521,6 +1520,8 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i CUDA_TRY( cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + shmem_limit_per_block /= NUM_BLOCKS_PER_KERNEL_TO_COLUMNS; + std::vector column_starts; std::vector column_sizes; @@ -1530,7 +1531,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i size_type fixed_width_size_per_row = detail::compute_column_information( iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); - size_type validity_size = num_bitmask_words(num_columns); + size_type validity_size = num_bitmask_words(num_columns) * 4; size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); @@ -1567,7 +1568,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - dim3 blocks(block_infos.size()); + dim3 blocks((block_infos.size() + (NUM_BLOCKS_PER_KERNEL_TO_COLUMNS - 1)) / NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); #if defined(DEBUG) || 1 dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)child.size())); #else @@ -1581,6 +1582,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i detail::copy_to_columns<<>>( num_rows, num_columns, + shmem_limit_per_block, input.offsets().data(), dev_output_data.data(), dev_output_nm.data(), From 636b235750668dc06d63512574b6b8cee2d263e6 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Mon, 7 Jun 2021 08:14:52 +0000 Subject: [PATCH 08/80] working on row and column conversions --- cpp/benchmarks/CMakeLists.txt | 27 +- .../row_conversion/row_conversion.cpp | 106 +- cpp/include/cudf/row_conversion.hpp | 12 - cpp/src/row_conversion/row_conversion.cu | 1183 +++++------------ 4 files changed, 320 insertions(+), 1008 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 7d353c37df7..5cc48436d01 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -29,7 +29,6 @@ target_link_libraries(cudf_datagen GTest::gmock_main GTest::gtest_main benchmark::benchmark - nvbench::nvbench Threads::Threads cudf) @@ -51,19 +50,11 @@ target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen) function(ConfigureBench CMAKE_BENCH_NAME) add_executable(${CMAKE_BENCH_NAME} ${ARGN}) set_target_properties(${CMAKE_BENCH_NAME} - PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$") + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$") target_link_libraries(${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_datagen benchmark::benchmark_main) endfunction() -function(ConfigureNVBench CMAKE_BENCH_NAME) - add_executable(${CMAKE_BENCH_NAME} ${ARGN}) - set_target_properties(${CMAKE_BENCH_NAME} - PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$") - target_link_libraries(${CMAKE_BENCH_NAME} - PRIVATE cudf_benchmark_common cudf_datagen nvbench::main) -endfunction() - 
################################################################################################### # - column benchmarks ----------------------------------------------------------------------------- ConfigureBench(COLUMN_CONCAT_BENCH column/concatenate_benchmark.cpp) @@ -76,10 +67,6 @@ ConfigureBench(GATHER_BENCH copying/gather_benchmark.cu) # - scatter benchmark ----------------------------------------------------------------------------- ConfigureBench(SCATTER_BENCH copying/scatter_benchmark.cu) -################################################################################################### -# - lists scatter benchmark ----------------------------------------------------------------------- -ConfigureBench(SCATTER_LISTS_BENCH lists/copying/scatter_lists_benchmark.cu) - ################################################################################################### # - contiguous_split benchmark ------------------------------------------------------------------- ConfigureBench(CONTIGUOUS_SPLIT_BENCH copying/contiguous_split_benchmark.cu) @@ -102,8 +89,7 @@ ConfigureBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates_benchma ################################################################################################### # - join benchmark -------------------------------------------------------------------------------- -ConfigureBench(JOIN_BENCH join/join_benchmark.cu join/conditional_join_benchmark.cu) -ConfigureNVBench(JOIN_NVBENCH join/join_nvbench.cu) +ConfigureBench(JOIN_BENCH join/join_benchmark.cu) ################################################################################################### # - iterator benchmark ---------------------------------------------------------------------------- @@ -205,7 +191,6 @@ ConfigureBench(AST_BENCH ast/transform_benchmark.cpp) # - binaryop benchmark ---------------------------------------------------------------------------- ConfigureBench(BINARYOP_BENCH binaryop/binaryop_benchmark.cpp - binaryop/compiled_binaryop_benchmark.cpp binaryop/jit_binaryop_benchmark.cpp) ################################################################################################### @@ -233,7 +218,6 @@ ConfigureBench(STRINGS_BENCH string/factory_benchmark.cu string/filter_benchmark.cpp string/find_benchmark.cpp - string/repeat_strings_benchmark.cpp string/replace_benchmark.cpp string/replace_re_benchmark.cpp string/split_benchmark.cpp @@ -247,10 +231,5 @@ ConfigureBench(JSON_BENCH string/json_benchmark.cpp) ################################################################################################### -# - io benchmark --------------------------------------------------------------------- -ConfigureBench(MULTIBYTE_SPLIT_BENCHMARK - io/text/multibyte_split_benchmark.cpp) - -################################################################################################### -# - row conversion benchmark --------------------------------------------------------- +# - row conversion benchmark ---------------------------------------------------------------------------- ConfigureBench(ROW_CONVERSION_BENCH row_conversion/row_conversion.cpp) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index 7c1f52c5cd6..c4edee91b3c 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -25,7 +25,7 @@ class RowConversion : public cudf::benchmark { }; -static void BM_old_to_row(benchmark::State& state) +static void 
BM_to_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const table = create_random_table({cudf::type_id::INT8, @@ -37,44 +37,8 @@ static void BM_old_to_row(benchmark::State& state) cudf::type_id::UINT16, cudf::type_id::UINT8, cudf::type_id::UINT64}, - 212, + 50, row_count{n_rows}); - /* auto const table = create_random_table({cudf::type_id::INT32}, - 64, - row_count{n_rows});*/ - - cudf::size_type total_bytes = 0; - for (int i = 0; i < table->num_columns(); ++i) { - auto t = table->get_column(i).type(); - total_bytes += cudf::size_of(t); - } - - for (auto _ : state) { - cuda_event_timer raii(state, true, rmm::cuda_stream_default); - - auto rows = cudf::convert_to_rows(table->view()); - } - - state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -static void BM_new_to_row(benchmark::State& state) -{ - cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, - cudf::type_id::INT32, - cudf::type_id::INT16, - cudf::type_id::INT64, - cudf::type_id::INT32, - cudf::type_id::BOOL8, - cudf::type_id::UINT16, - cudf::type_id::UINT8, - cudf::type_id::UINT64}, - 212, - row_count{n_rows}); - /* auto const table = create_random_table({cudf::type_id::INT32}, - 64, - row_count{n_rows});*/ cudf::size_type total_bytes = 0; for (int i = 0; i < table->num_columns(); ++i) { @@ -85,13 +49,14 @@ static void BM_new_to_row(benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); +// auto rows = cudf::convert_to_rows(table->view()); auto new_rows = cudf::convert_to_rows2(table->view()); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); } -static void BM_old_from_row(benchmark::State& state) +static void BM_from_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const table = create_random_table({cudf::type_id::INT8, @@ -105,6 +70,9 @@ static void BM_old_from_row(benchmark::State& state) cudf::type_id::UINT64}, 256, row_count{n_rows}); + /* auto const table = create_random_table({cudf::type_id::INT32}, + 4, + row_count{n_rows});*/ std::vector schema; cudf::size_type total_bytes = 0; @@ -125,60 +93,24 @@ static void BM_old_from_row(benchmark::State& state) state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); } -static void BM_new_from_row(benchmark::State& state) -{ - cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, - cudf::type_id::INT32, - cudf::type_id::INT16, - cudf::type_id::INT64, - cudf::type_id::INT32, - cudf::type_id::BOOL8, - cudf::type_id::UINT16, - cudf::type_id::UINT8, - cudf::type_id::UINT64}, - 256, - row_count{n_rows}); - - std::vector schema; - cudf::size_type total_bytes = 0; - for (int i = 0; i < table->num_columns(); ++i) { - auto t = table->get_column(i).type(); - schema.push_back(t); - total_bytes += cudf::size_of(t); - } - - auto rows = cudf::convert_to_rows(table->view()); - - for (auto _ : state) { - cuda_event_timer raii(state, true, rmm::cuda_stream_default); - - auto out = cudf::convert_from_rows2(rows, schema); - } - - state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - 
->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ +#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { BM_to_row(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 16, 1 << 24}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) -TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(to_row_conversion) -#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ +#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ + (::benchmark::State & st) { BM_from_row(st); } \ BENCHMARK_REGISTER_F(RowConversion, name) \ ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ + ->Ranges({{1 << 6, 1 << 22}}) \ ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -FROM_ROW_CONVERSION_BENCHMARK_DEFINE(old_from_row_conversion, BM_old_from_row) -FROM_ROW_CONVERSION_BENCHMARK_DEFINE(new_from_row_conversion, BM_new_from_row) +FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) diff --git a/cpp/include/cudf/row_conversion.hpp b/cpp/include/cudf/row_conversion.hpp index 282ffa4b0cb..f5e2225ad19 100644 --- a/cpp/include/cudf/row_conversion.hpp +++ b/cpp/include/cudf/row_conversion.hpp @@ -48,16 +48,4 @@ std::unique_ptr convert_from_rows( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); -std::unique_ptr convert_from_rows2( - cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr convert_from_rows2( - std::vector> const &input, - std::vector const &schema, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); - } // namespace cudf diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 0879a1c50a5..fb5dc4cb38d 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -14,14 +14,12 @@ * limitations under the License. */ -#include #include #include #include #include #include -#include #include #include #include @@ -31,15 +29,11 @@ #include #include -#include #include "cudf/types.hpp" #include "rmm/device_buffer.hpp" #include "thrust/iterator/counting_iterator.h" #include "thrust/iterator/transform_iterator.h" -#define NUM_BLOCKS_PER_KERNEL_TO_COLUMNS (2) - -using cudf::detail::make_device_uvector_async; namespace cudf { namespace detail { @@ -49,6 +43,34 @@ static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size return (offset + alignment - 1) & ~(alignment - 1); } + +/** + * Copy a simple vector to device memory asynchronously. Be sure to read + * the data on the same stream as is used to copy it. 
+ */ +template +std::unique_ptr> copy_to_dev_async(const std::vector &input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + std::unique_ptr> ret(new rmm::device_uvector(input.size(), stream, mr)); + CUDA_TRY(cudaMemcpyAsync( + ret->data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); + return ret; +} + +template +rmm::device_uvector copy_to_dev_async2( + const std::vector &input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + rmm::device_uvector ret(input.size(), stream, mr); + CUDA_TRY(cudaMemcpyAsync( + ret.data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); + return ret; +} + __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type row_size, @@ -162,7 +184,7 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, cudf::size_type byte_bit_offset = col_index % 8; int predicate = *valid_byte & (1 << byte_bit_offset); uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { nm[row_index / 8] = bitmask; } + if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } } // end column loop } // end row copy // wait for the row_group to be totally copied before starting on the next row group @@ -311,20 +333,6 @@ struct block_info { int buffer_num; }; -// When building the columns to return, we have to be mindful of the offset limit in cudf. -// It is 32-bit and these data columns are capable of surpassing that easily. The data should -// not be cut off exactly at the limit though due to the validity buffers. The most efficient -// place to cut the validity is on a 32-row boundary, so as we calculate the row sizes -// we keep track of the cut points for the validity, which we call row batches. If the row -// is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we -// hit. Note that this boundary is for our book-keeping with column pointers and not anything that -// the kernel needs to worry about. We cut the output at convienient boundaries when assembling -// the outgoing data stream. -struct row_batch { - size_type num_bytes; - size_type row_count; -}; - /** * @brief copy data from cudf columns into x format, which is row-based * @@ -337,16 +345,16 @@ struct row_batch { * @param block_infos information about the blocks of work * @param row_offsets offset to a specific row in the input data * @param output_data pointer to output data - * + * */ -__global__ void copy_from_columns(const size_type num_rows, - const size_type num_columns, +__global__ void copy_from_columns(const cudf::size_type num_rows, + const cudf::size_type num_columns, const int8_t **input_data, - const bitmask_type **input_nm, - const size_type *col_sizes, - const size_type *col_offsets, + const cudf::bitmask_type **input_nm, + const cudf::size_type *col_sizes, + const cudf::size_type *col_offsets, const block_info *block_infos, - const size_type *row_offsets, + const uint64_t *row_offsets, int8_t **output_data) { // We are going to copy the data in two passes. @@ -357,119 +365,46 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. 
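The hunk above stores one 32-bit null-mask word per 32 rows: every thread contributes a predicate bit through __ballot_sync, and only the lane whose row index is a multiple of 32 writes the assembled word at word_index(row_index). A host-side emulation of that packing, assuming word_index(row) == row / 32 as in cudf's bitmask convention (a sketch, not the kernel):

#include <cassert>
#include <cstdint>
#include <vector>

// Sketch: pack one validity bit per row into 32-bit words, mirroring what the
// warp ballot in copy_to_fixed_width_columns produces on the device.
std::vector<uint32_t> pack_validity(std::vector<bool> const& valid)
{
  std::vector<uint32_t> words((valid.size() + 31) / 32, 0);
  for (std::size_t row = 0; row < valid.size(); ++row) {
    if (valid[row]) { words[row / 32] |= 1u << (row % 32); }  // word_index(row) == row / 32
  }
  return words;
}

int main()
{
  std::vector<bool> valid(40, false);
  valid[0]  = true;
  valid[33] = true;
  auto const words = pack_validity(valid);
  assert(words.size() == 2);
  assert(words[0] == 1u);         // row 0 -> bit 0 of the first word
  assert(words[1] == (1u << 1));  // row 33 -> bit 1 of the second word
  return 0;
}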
- bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; - - if (debug_print) { - printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("Column Info:\n"); - for (int i = 0; i < num_columns; ++i) { - printf("col %d is at %p with size %d and offset %d\n", - i, - input_data[i], - col_sizes[i], - col_offsets[i]); - } - printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); - /* printf("Row Offsets:\n"); - for (int i=0; i(&output_data[0][output_start_offset]) & - 7; // offset for alignment shim in order to match shared memory with final dest - if (debug_print) { - printf("outputting to offset %lu\n", output_start_offset); - printf("dest shim offset is %d\n", dest_shim_offset); - printf("Shared data is %p-%p\n", shared_data, shared_data + (48 * 1024)); - printf("my block is %d,%d -> %d,%d - buffer %d\n", - block.start_col, - block.start_row, - block.end_col, - block.end_row, - block.buffer_num); - } + uint8_t const dest_shim_offset = reinterpret_cast(&output_data[0][output_start_offset]) & 7; // offset for alignment shim in order to match shared memory with final dest + + printf("copying from column %d to column %d with rows %d to row %d(grid dim %d, blockIdx %d)\n", block.start_col, block.end_col, block.start_row, block.end_row, gridDim.x, blockIdx.x); + // each thread is responsible for every threadcount rows of data. - // the data is copied into shared memory in the final layout. - auto const real_bytes_in_row = - col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col]; - auto const shmem_row_size = align_offset(real_bytes_in_row + dest_shim_offset, - 8); // 8 byte alignment required for shared memory rows + // the data is copies into shared memory in the final layout. 
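dest_shim_offset above is the output row's address modulo 8, so rows staged in shared memory can reproduce the destination's (possibly unaligned) placement and still be flushed with 8-byte stores. The bit trick in isolation, with made-up addresses (a sketch only):

#include <cassert>
#include <cstdint>

// Sketch: misalignment of an address within an 8-byte word; for unsigned
// values, addr & 7 is equivalent to addr % 8.
constexpr uint8_t shim_offset(uintptr_t addr) { return static_cast<uint8_t>(addr & 7); }

int main()
{
  assert(shim_offset(0x1000) == 0);  // already 8-byte aligned
  assert(shim_offset(0x1003) == 3);  // three bytes past an 8-byte boundary
  assert(shim_offset(0x100F) == 7);
  return 0;
}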
+ auto const shmem_row_size = align_offset(col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col] + dest_shim_offset, 8); // 8 byte alignment required for shared memory rows auto const validity_offset = col_offsets[num_columns]; - if (debug_print) { - printf("col_offsets[%d] = %d, col_sizes[%d] = %d, col_offsets[%d] = %d\n", - block.end_col, - col_offsets[block.end_col], - block.end_col, - col_sizes[block.end_col], - block.start_col, - col_offsets[block.start_col]); - printf("shmem row size %d with real bytes %d\n", shmem_row_size, real_bytes_in_row); - printf("validity offset is %d\n", validity_offset); - printf("starting at %d,%d and going to %d, %d\n", - block.start_col, - block.start_row, - block.end_col, - block.end_row); - } - for (int col = block.start_col; col <= block.end_col; ++col) { - /*if (!col_is_variable) */ { - uint64_t col_offset = 0; + for (int col=block.start_col; col<=block.end_col; ++col) { + /*if (!col_is_variable) */{ + uint64_t col_offset = 0; cudf::size_type col_size = col_sizes[col]; - auto const dest_col_offset = - col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; - if (debug_print) { printf("dest col offset %d\n", dest_col_offset); } - for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { - if (debug_print) { - printf("shmem row %d(%d) at offset %d(%d)\n", - row - block.start_row, - row, - (row - block.start_row) * shmem_row_size, - row * shmem_row_size); - } - int8_t *shmem_dest = - &shared_data[dest_col_offset + shmem_row_size * (row - block.start_row)]; + auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; + for (int row=block.start_row + threadIdx.x; row(input_data[col]); - if (debug_print) { printf("%p <- short %d\n", shmem_dest, short_col_input[row]); } + const int16_t *short_col_input = reinterpret_cast(input_data[col]); *reinterpret_cast(shmem_dest) = short_col_input[row]; break; } case 4: { - const int32_t *int_col_input = reinterpret_cast(input_data[col]); - if (debug_print) { - printf("shmem[%d][%d] - %p <- int 0x%x\n", row, col, shmem_dest, int_col_input[row]); - } + const int32_t *int_col_input = reinterpret_cast(input_data[col]); *reinterpret_cast(shmem_dest) = int_col_input[row]; break; } case 8: { - const int64_t *long_col_input = reinterpret_cast(input_data[col]); - if (debug_print) { printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); } + const int64_t *long_col_input = reinterpret_cast(input_data[col]); *reinterpret_cast(shmem_dest) = long_col_input[row]; break; } default: { cudf::size_type input_offset = col_size * row; - if (debug_print) { - printf("byte for byte copy due to size %d of column %d\n", col_size, col); - printf("%p <- input_data[%d] which is %d\n", - shmem_dest, - input_offset, - input_data[col][input_offset]); - } // TODO this should just not be supported for fixed width columns, but just in case... 
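The switch above specializes the shared-memory store on the element width (2, 4, or 8 bytes) and falls back to a byte loop for anything else. A host-side sketch of the same row-packing idea using the column size/offset tables; memcpy stands in for the width-specialized loads, and all names and values are illustrative:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Sketch: copy one row's fixed-width values from columnar buffers into a packed row.
void pack_row(std::vector<std::vector<int8_t>> const& col_data,  // raw bytes per column
              std::vector<int> const& col_sizes,
              std::vector<int> const& col_offsets,
              int row,
              int8_t* row_out)
{
  for (std::size_t col = 0; col < col_sizes.size(); ++col) {
    auto const size = col_sizes[col];
    std::memcpy(row_out + col_offsets[col], col_data[col].data() + row * size, size);
  }
}

int main()
{
  // Two columns: an int32 column and an int16 column, packing row 1.
  int32_t const ints[]   = {7, 42};
  int16_t const shorts[] = {3, 9};
  std::vector<std::vector<int8_t>> col_data(2);
  col_data[0].assign(reinterpret_cast<int8_t const*>(ints),
                     reinterpret_cast<int8_t const*>(ints) + sizeof(ints));
  col_data[1].assign(reinterpret_cast<int8_t const*>(shorts),
                     reinterpret_cast<int8_t const*>(shorts) + sizeof(shorts));

  int8_t row[6] = {};
  pack_row(col_data, {4, 2}, {0, 4}, /*row=*/1, row);

  int32_t i = 0;
  int16_t s = 0;
  std::memcpy(&i, row, 4);
  std::memcpy(&s, row + 4, 2);
  assert(i == 42 && s == 9);
  return 0;
}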
for (cudf::size_type b = 0; b < col_size; b++) { shmem_dest[b] = input_data[col][b + input_offset]; @@ -482,13 +417,11 @@ __global__ void copy_from_columns(const size_type num_rows, // so we have to rewrite the addresses to make sure that it is 4 byte aligned // we do this directly in the final location because the entire row may not // fit in shared memory and may require many blocks to process it entirely - int8_t *valid_byte = - &output_data[block.buffer_num][row_offsets[row] + validity_offset + (col / 8)]; + int8_t *valid_byte = &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; cudf::size_type byte_bit_offset = col % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); - if (debug_print) { printf("Outputting validity to %p\n", valid_byte); } // Now copy validity for the column if (input_nm[col]) { if (bit_is_set(input_nm[col], row)) { @@ -500,11 +433,11 @@ __global__ void copy_from_columns(const size_type num_rows, // It is valid so just set the bit atomicOr_block(valid_int, 1 << int_bit_offset); } - } // end row + } // end row - col_offset += col_sizes[col] * rows_in_block; + col_offset += col_sizes[col] * (block.end_row - block.start_row); } - } // end col + } // end col // wait for the data to be totally copied into shared memory __syncthreads(); @@ -517,311 +450,35 @@ __global__ void copy_from_columns(const size_type num_rows, // row in shared memory may not be an entire row of the destination. // auto const thread_start_offset = threadIdx.x * 8; - auto const thread_stride = blockDim.x * 8; - auto const end_offset = shmem_row_size * rows_in_block; - - if (debug_print) { - printf("writing final data from %d to %d at stride %d\n", - thread_start_offset, - shmem_row_size * rows_in_block, - thread_stride); - printf("rows in block %d\n", rows_in_block); - } - for (auto src_offset = thread_start_offset; src_offset < end_offset; - src_offset += thread_stride) { + auto const thread_stride = gridDim.x * 8; + for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * (block.end_row - block.start_row); src_offset += thread_stride) { auto const output_row_num = src_offset / shmem_row_size; - auto const row_offset = row_offsets[block.start_row + output_row_num]; - auto const col_offset = src_offset % shmem_row_size; - int8_t *output_ptr = &output_data[block.buffer_num][row_offset + col_offset]; - int8_t *input_ptr = &shared_data[src_offset]; - - // three cases to worry about here - // 1) first 8-byte part of a large row - dest_shim_offset bytes of pad at the front - // 2) last 8-byte part of a large row - some bytes of pad at the end - // 3) corner case of <= 8 bytes of data, which means dest_shim_offset bytes of pad at the front - // AND potentially pad at the rear - - // we know the real number of bytes in a row, so we can figure out if we are in case 3 easily. - // 1st case is when we're at some even multiple of shmem_row_size offset. - // 2nd case is when offset + 8 is some even multiple of shmem_row_size. - // must be an 8 byte copy - - // there is a chance we have a 0 dest_shim_offset and an 8 byte thing to copy, optimize? 
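Because atomicOr needs a 4-byte-aligned word, the validity write above rebases the target byte onto its enclosing int32 and widens the bit offset by 8 bits per byte of fixup. The index arithmetic in isolation, assuming the validity area itself starts 4-byte aligned and a little-endian byte order as on the GPU (a sketch):

#include <cassert>

int main()
{
  // The validity bit for column `col` lives in byte (col / 8), bit (col % 8).
  int const col             = 13;
  int const byte_index      = col / 8;  // byte 1
  int const byte_bit_offset = col % 8;  // bit 5 of that byte

  // Rebase onto the enclosing 4-byte word so an atomic OR can be used.
  int const fixup_bytes    = byte_index % 4;                     // bytes between the word start and our byte
  int const word_index     = byte_index / 4;                     // which int32 receives the atomic OR
  int const int_bit_offset = byte_bit_offset + fixup_bytes * 8;  // bit within that int32

  assert(word_index == 0);
  assert(int_bit_offset == 13);  // same absolute bit as byte 1, bit 5 in a little-endian layout
  return 0;
}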
- if (real_bytes_in_row + dest_shim_offset <= 8) { - // case 3, we want to copy real_bytes_in_row bytes - auto const num_single_bytes = real_bytes_in_row - dest_shim_offset; - for (auto i = 0; i < num_single_bytes; ++i) { - if (debug_print) { - printf("case 3 - %d single byte final write %p(%d) -> %p\n", - num_single_bytes, - &input_ptr[i + dest_shim_offset], - input_ptr[i + dest_shim_offset], - &output_ptr[i]); - } - output_ptr[i] = input_ptr[i + dest_shim_offset]; - } - } else if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { - // first byte with leading pad + auto const row_offset = row_offsets[block.start_row + output_row_num]; + auto const col_offset = src_offset % shmem_row_size; + int8_t *output_ptr = &output_data[block.buffer_num][row_offset + col_offset]; + int8_t *input_ptr = &shared_data[src_offset]; + // the first part and last part of the row is unaligned data copy. This is copied a single byte + // at a time. + if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { + // first part of a row, copy single bytes auto const num_single_bytes = 8 - dest_shim_offset; - for (auto i = 0; i < num_single_bytes; ++i) { - if (debug_print) { - printf( - "single byte final write %p -> %p\n", &input_ptr[i + dest_shim_offset], &output_ptr[i]); - } + for (auto i=0; i 0) { - // last bytes of a row - auto const num_single_bytes = (real_bytes_in_row + dest_shim_offset) % 8; - for (auto i = 0; i < num_single_bytes; ++i) { - if (debug_print) { - printf("single trailing byte final write %p -> %p\n", - &input_ptr[i + dest_shim_offset], - &output_ptr[i]); - } + } else if (dest_shim_offset > 0 && (src_offset + 8) % shmem_row_size == 0) { + // last part of a row, copy single bytes + auto const num_single_bytes = dest_shim_offset; + for (auto i=0; i(input_ptr); - if (debug_print) { - printf( - "long final write %p -> %p\n", long_col_input, reinterpret_cast(output_ptr)); - } + const int64_t *long_col_input = reinterpret_cast(input_ptr); *reinterpret_cast(output_ptr) = *long_col_input; } } } -/** - * @brief copy data from row-based format to cudf columns - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets - * @param output_data - * @param output_nm - * @param col_sizes array of sizes for each element in a column - one per column - * @param col_offsets offset into input data row for each column's start - * @param block_infos information about the blocks of work - * @param input_data pointer to input data - * - */ -__global__ void copy_to_columns(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_block, - const size_type *offsets, - int8_t **output_data, - cudf::bitmask_type **output_nm, - const size_type *col_sizes, - const size_type *col_offsets, - const block_info *block_infos, - const int8_t *input_data) -{ - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // This has been broken up for us in the block_info struct, so we don't have - // any calculation to do here, but it is important to note. 
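In the write-back loop above, only the first and last 8-byte chunk of a staged row can be partially meaningful: the head holds 8 - dest_shim_offset real bytes and the tail holds whatever does not fill a final 8-byte store. A quick sketch of those counts with made-up sizes (not the kernel code):

#include <cassert>

int main()
{
  int const dest_shim_offset  = 3;   // destination row starts 3 bytes past an 8-byte boundary
  int const real_bytes_in_row = 21;  // payload bytes for this row

  int const head_bytes  = 8 - dest_shim_offset;                        // copied one byte at a time
  int const tail_bytes  = (real_bytes_in_row + dest_shim_offset) % 8;  // leftover after the last full store
  int const full_chunks = (real_bytes_in_row - head_bytes - tail_bytes) / 8;  // moved as 8-byte stores

  assert(head_bytes == 5);
  assert(tail_bytes == 0);   // 21 + 3 is a multiple of 8, so no trailing singles
  assert(full_chunks == 2);  // 5 + 16 + 0 == 21
  return 0;
}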
- - constexpr bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; - - if (debug_print) { - printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); - /* printf("Row Offsets:\n"); - for (int i=0; i blockDim.x) { - break; - } - auto block = block_infos[this_block_index]; - auto const rows_in_block = block.end_row - block.start_row + 1; - auto const cols_in_block = block.end_col - block.start_col + 1; - extern __shared__ int8_t shared_data[]; - - // copy data from our block's window to shared memory - // offsets information can get us on the row, then we need to know where the column - // starts to offset into the row data. - - // each thread is responsible for 8-byte chunks starting at threadIdx.x and striding - // at blockDim.x. If the 8-byte chunk falls on the boundary of the window, then the - // thread may copy less than 8 bytes. Even if at the beginning of the window, because - // every internal copy is aligned to 8-byte boundaries. - // - // thread 0 thread 1 thread 2 thread 3 thread 4 thread 5 - // 01234567 89abcdef 01234567 89abcdef 01234567 89abcdef - // xxxbbbbb bbbbbbbb bbbbbbbb bbbbbbbb bbbbbbbb bbxxxxxx - // | | | | | | | - // - // - - auto const window_start_quad = col_offsets[block.start_col] / 8; - auto const window_end_quad = (col_offsets[block.end_col] + col_sizes[block.end_col] + 7) / 8; - auto const window_quad_width = window_end_quad - window_start_quad; - auto const total_quads = window_quad_width * rows_in_block; - auto const shared_memory_starting_pad = col_offsets[block.start_col] & 0x7; - - if (debug_print) { - printf("col_offsets[%d]: %d, col_offsets[%d]: %d col_sizes[%d]: %d\n", block.start_col, col_offsets[block.start_col], block.end_col, col_offsets[block.end_col], block.end_col, col_sizes[block.end_col]); - printf("window start quad is %d, window end quad is %d\n", window_start_quad, window_end_quad); - printf("window quad width is %d and there are %d total quads\n%d shared memory starting pad\n", window_quad_width, total_quads, shared_memory_starting_pad); - } - - // the copy to shared memory will be greedy. We know that the data is 8-byte aligned, so we won't - // access illegal memory by doing 8-byte aligned copies, so we can copy 8-byte aligned. This will - // result in the window edges being duplicated across blocks, but we can copy the padding as well - // to speed up our transfers to shared memory. 
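The removed copy_to_columns pass stages each window in 8-byte "quads" so the reads from row data are always aligned: the window is widened outward to quad boundaries and the extra leading bytes are remembered as a starting pad. The bookkeeping in isolation, with an invented column layout (sketch only):

#include <cassert>

int main()
{
  // Window covers columns whose row-relative bytes span [5, 18): the first
  // column starts at offset 5, the last starts at 14 with size 4.
  int const start_offset  = 5;
  int const end_offset    = 14;
  int const end_size      = 4;
  int const rows_in_block = 32;

  int const window_start_quad = start_offset / 8;                 // 0
  int const window_end_quad   = (end_offset + end_size + 7) / 8;  // (18 + 7) / 8 == 3
  int const window_quad_width = window_end_quad - window_start_quad;
  int const total_quads       = window_quad_width * rows_in_block;
  int const starting_pad      = start_offset & 0x7;               // bytes staged before the first column

  assert(window_quad_width == 3);  // 24 bytes staged per row for a 13-byte window
  assert(total_quads == 96);
  assert(starting_pad == 5);
  return 0;
}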
- for (int i = threadIdx.x; i < total_quads; i += blockDim.x) { - auto const relative_row = i / window_quad_width; - auto const absolute_row = relative_row + block.start_row; - //auto const row = i / window_quad_width; - auto const offset_in_row = i % window_quad_width * 8; - auto const shmem_dest = &shared_data[i * 8]; - - if (debug_print) { - printf("relative_row: %d, absolute_row: %d, offset_in_row: %d, shmem_dest: %p\n", relative_row, absolute_row, offset_in_row, shmem_dest); - printf("offsets is %p\n", offsets); - printf("offsets[%d]: %d\n", absolute_row, offsets[absolute_row]); - printf("input_data[%d] will be dereferenced\n", offsets[absolute_row] + offset_in_row); - } - - // full 8-byte copy - const int64_t *long_col_input = - reinterpret_cast(&input_data[offsets[absolute_row] + offset_in_row]); - if (debug_print) { - printf("which will be address %p\n", long_col_input); - printf("%p <- long %lu\n", shmem_dest, *long_col_input); } - *reinterpret_cast(shmem_dest) = *long_col_input; - } - - __syncthreads(); - - // now we copy from shared memory to final destination. - // the data is laid out in rows in shared memory, so the reads - // for a column will be "vertical". Because of this and the different - // sizes for each column, this portion is handled on row/column basis. - // to prevent each thread working on a single row and also to ensure - // that all threads can do work in the case of more threads than rows, - // we do a global index instead of a double for loop with col/row. - for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { - auto const relative_col = index % cols_in_block; - auto const relative_row = index / cols_in_block; - auto const absolute_col = relative_col + block.start_col; - auto const absolute_row = relative_row + block.start_row; - - auto const shared_memory_row_offset = window_quad_width * 8 * relative_row; - auto const shared_memory_offset = col_offsets[absolute_col] - col_offsets[block.start_col] + - shared_memory_row_offset + shared_memory_starting_pad; - auto const column_size = col_sizes[absolute_col]; - - int8_t *shmem_src = &shared_data[shared_memory_offset]; - int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; - - if (debug_print) { - printf("relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, shared_mmeory_row_offset: %d, shared_memory_offset: %d," - " column_size: %d, shmem_src: %p, dst: %p\n", relative_col, relative_row, absolute_col, absolute_row, shared_memory_row_offset, shared_memory_offset, column_size, - shmem_src, dst) ; - } - switch (column_size) { - case 1: { - if (debug_print) { printf("%p <- byte %d\n", dst, *shmem_src); } - *dst = *shmem_src; - break; - } - case 2: { - const int16_t *short_col_input = reinterpret_cast(shmem_src); - if (debug_print) { printf("%p <- short %d\n", dst, *short_col_input); } - *reinterpret_cast(dst) = *short_col_input; - break; - } - case 4: { - const int32_t *int_col_input = reinterpret_cast(shmem_src); - if (debug_print) { printf("%p <- int 0x%x\n", dst, *int_col_input); } - *reinterpret_cast(dst) = *int_col_input; - break; - } - case 8: { - const int64_t *long_col_input = reinterpret_cast(shmem_src); - if (debug_print) { printf("%p <- long %lu\n", dst, *long_col_input); } - *reinterpret_cast(dst) = *long_col_input; - break; - } - default: { - if (debug_print) { - printf("byte for byte copy due to size %d of column %d\n", column_size, absolute_col); - } - // TODO this should just not be supported for fixed width columns, but just 
in case... - for (cudf::size_type b = 0; b < column_size; b++) { dst[b] = shmem_src[b]; } - break; - } - } - } - - // now handle validity. Each thread is responsible for 32 rows in 8 columns. - // to prevent indexing issues with a large number of threads, this is compressed - // to a single loop like above. TODO: investigate using shared memory here - auto const validity_batches_per_col = (num_rows + 31) / 32; - auto const validity_batches_total = std::max(1, validity_batches_per_col * (num_columns / 8)); - if (debug_print && threadIdx.x == 0 && blockIdx.x == 0) { - printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n%d blocks of %d threads\n", validity_batches_per_col, validity_batches_total, num_rows, gridDim.x, blockDim.x); - } - for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < validity_batches_total; index += blockDim.x * gridDim.x) { - auto const start_col = (index * 8) / validity_batches_per_col; - auto const batch = index % validity_batches_per_col; - auto const starting_row = batch * 32; - auto const validity_offset = col_offsets[num_columns] + (start_col / 8); - - if (debug_print) { - printf("%d-%d: cols: %d-%d, word index: %d, batch: %d, starting_row: %d, +validity_offset: %d, index: %d, stride: %d\n", threadIdx.x, blockIdx.x, start_col, start_col + 7, (start_col / 8), batch, starting_row, validity_offset, index, blockDim.x * gridDim.x); - } - - // one for each column - int32_t dst_validity[8] = {0}; - for (int row = starting_row; row < std::min(num_rows, starting_row + 32); ++row) { - int8_t const * const validity_ptr = &input_data[offsets[row] + validity_offset]; - - if (debug_print) { - printf("%d: validity_ptr is %p for row %d\n", threadIdx.x, validity_ptr, row); - } - - auto const val_byte = *validity_ptr; - - for (int i=0; i> src_shift); - } - // auto const dst_bit_mask = 1 << dst_shift; - dst_validity[i] |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); - } - } - - - for (int i=0; i(output_nm[start_col + i] + (starting_row / 32)); - if (debug_print) { - printf("%d-%d: validity write output_nm[%d][%d] - %p <- %d\n", threadIdx.x, blockIdx.x, start_col + i, starting_row, validity_ptr, dst_validity[i]); - } - *validity_ptr = dst_validity[i]; - } - } -} -} - /** * Calculate the dimensions of the kernel for fixed width only columns. * @param [in] num_columns the number of columns being copied. 
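The validity gather removed above transposes row-major validity bytes (one byte covers 8 columns of one row) into cudf's column-major 32-bit null-mask words (one word covers 32 rows of one column). A host-side emulation for a single group of 8 columns and 32 rows, assuming bit i of a row byte belongs to column start_col + i (a sketch, not the kernel):

#include <cassert>
#include <cstdint>
#include <vector>

// Sketch: one row-major validity byte per row becomes 8 column-major words,
// where bit r of column i's word marks row r of that column.
std::vector<uint32_t> transpose_validity(std::vector<uint8_t> const& row_bytes)
{
  std::vector<uint32_t> col_words(8, 0);
  for (std::size_t row = 0; row < row_bytes.size(); ++row) {
    for (int col = 0; col < 8; ++col) {
      uint32_t const bit = (row_bytes[row] >> col) & 1u;
      col_words[col] |= bit << row;
    }
  }
  return col_words;
}

int main()
{
  std::vector<uint8_t> row_bytes(32, 0);
  row_bytes[0]  = 0x01;  // row 0: only column 0 valid
  row_bytes[31] = 0x80;  // row 31: only column 7 valid
  auto const words = transpose_validity(row_bytes);
  assert(words[0] == 1u);          // column 0 has its bit for row 0 set
  assert(words[7] == (1u << 31));  // column 7 has its bit for row 31 set
  assert(words[3] == 0u);
  return 0;
}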
@@ -895,10 +552,10 @@ static std::unique_ptr fixed_width_convert_to_rows( const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type size_per_row, - rmm::device_uvector &column_start, - rmm::device_uvector &column_size, - rmm::device_uvector &input_data, - rmm::device_uvector &input_nm, + std::unique_ptr> &column_start, + std::unique_ptr> &column_size, + std::unique_ptr> &input_data, + std::unique_ptr> &input_nm, const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, rmm::cuda_stream_view stream, @@ -929,10 +586,10 @@ static std::unique_ptr fixed_width_convert_to_rows( num_rows, num_columns, size_per_row, - column_start.data(), - column_size.data(), - input_data.data(), - input_nm.data(), + column_start->data(), + column_size->data(), + input_data->data(), + input_nm->data(), data->mutable_view().data()); return cudf::make_lists_column(num_rows, @@ -986,165 +643,21 @@ static inline int32_t compute_fixed_width_layout(std::vector co return align_offset(at_offset, 8); // 8 bytes (64 bits) } -template -static size_type compute_column_information( - iterator begin, - iterator end, - std::vector &column_starts, - std::vector &column_sizes)//, - //std::function nested_type_cb) -{ - size_type fixed_width_size_per_row = 0; - for (auto cv = begin; cv != end; ++cv) { - auto col_type = std::get<0>(*cv); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - -// if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } - - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 8 : size_of(col_type); - - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - } - - auto validity_offset = detail::align_offset(fixed_width_size_per_row, 4); - column_starts.push_back(validity_offset); - - return fixed_width_size_per_row; -} +} // namespace detail //#define DEBUG - -static std::vector build_block_infos(std::vector const &column_sizes, - std::vector const &column_starts, - std::vector const &row_batches, - size_type const total_number_of_rows, - size_type const &shmem_limit_per_block) +std::vector> convert_to_rows2(cudf::table_view const &tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { - std::vector block_infos; - - // block infos are organized with the windows going "down" the columns - // this provides the most coalescing of memory access - int current_window_width = 0; - int current_window_start_col = 0; - - // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( - int const start_col, int const end_col, int const desired_window_height) { - int current_window_start_row = 0; - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; - while (i < total_number_of_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(desired_window_height, rows_left_in_batch); - - block_infos.emplace_back(detail::block_info{ - start_col, - current_window_start_row, - end_col, - 
std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), - current_window_row_batch}); - - i += window_height; - current_window_start_row += window_height; - rows_left_in_batch -= window_height; - } - }; - - // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write - // would be memory cache line sized access, but since other blocks will read/write the edges this - // may not turn out to be overly important. For now, we will attempt to build a square window as - // far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we - // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in - // bytes, not rows or columns. - int const window_height = std::min( - std::min(size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], total_number_of_rows), - row_batches[0].row_count); -#if defined(DEBUG) - printf( - "sqrt(shmem_limit_per_block) / column_sizes[0] is %d and num_rows is %d, batch row count is %d - which makes window height " - "%d\n", - size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], - total_number_of_rows, - row_batches[0].row_count, - window_height); -#endif - - int row_size = 0; - - // march each column and build the blocks of appropriate sizes - for (unsigned int col = 0; col < column_sizes.size(); ++col) { - auto const col_size = column_sizes[col]; - - // align size for this type - std::size_t alignment_needed = col_size; // They are the same for fixed width types - auto row_size_aligned = detail::align_offset(row_size, alignment_needed); - auto row_size_with_this_col = row_size_aligned + col_size; - auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - - if (row_size_with_end_pad * window_height > shmem_limit_per_block) { -#if defined(DEBUG) - printf( - "Window size %d too large at column %d, bumping back to build windows of size %d(cols " - "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " - "for shared mem size %d\n", - row_size_with_end_pad * window_height, - col, - row_size * window_height, - current_window_start_col, - col - 1, - window_height, - row_size_with_end_pad, - row_size, - row_size_aligned, - shmem_limit_per_block); -#endif - // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col - 1, window_height); - row_size = - detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); -#if defined(DEBUG) - printf( - "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " - "or %d)\n", - row_size, - col_size, - row_size + col_size, - column_starts[col - 1], - column_sizes[col - 1], - column_starts[col - 1] + column_sizes[col - 1]); -#endif - row_size += col_size; // alignment required for shared memory window boundary to match - // alignment of output row - current_window_start_col = col; - current_window_width = 0; - } else { - row_size = row_size_with_this_col; - current_window_width++; - } - } - - // build last set of blocks - if (current_window_width > 0) { - build_blocks(current_window_start_col, (int)column_sizes.size()-1, window_height); - } - - return block_infos; -} -} // namespace detail + // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the data, but small enough + // that multiple columns fit in memory so the writes can coalese as well. Potential optimization for window sizes. 
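The removed heuristic above aims for a roughly square window in bytes: with shmem_limit_per_block bytes to fill, height and width both land near sqrt(shmem), and the height is then clamped by the table's row count and the first row batch. The sizing in isolation, with assumed values (a sketch of the heuristic, not the patch code):

#include <algorithm>
#include <cassert>
#include <cmath>

int main()
{
  int const shmem_limit_per_block = 48 * 1024;  // typical per-block shared-memory budget
  int const first_col_size        = 4;          // element size of the first column, in bytes
  int const num_rows              = 1000000;
  int const first_batch_rows      = 500000;

  // sqrt(shmem) bytes of height, expressed in rows of the first column,
  // clamped by the table size and by the first row batch.
  int const window_height =
    std::min({static_cast<int>(std::sqrt(static_cast<double>(shmem_limit_per_block))) / first_col_size,
              num_rows,
              first_batch_rows});

  assert(window_height == 55);  // sqrt(49152) ~ 221.7 -> 221 / 4 == 55
  return 0;
}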
+ constexpr int max_window_height = 1024; + const size_type num_columns = tbl.num_columns(); + const size_type num_rows = tbl.num_rows(); -#if defined(DEBUG) - void pretty_print(uint64_t i) { + #if defined(DEBUG) + auto pretty_print = [](uint64_t i) { if (i > (1 * 1024 * 1024 * 1024)) { printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); } else if (i > (1 * 1024 * 1024)) { @@ -1154,18 +667,8 @@ static std::vector build_block_infos(std::vector const &c } else { printf("%lu Bytes", i); } - } -#endif - -std::vector> convert_to_rows2(cudf::table_view const &tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the - // data, but small enough that multiple columns fit in memory so the writes can coalese as well. - // Potential optimization for window sizes. - const size_type num_columns = tbl.num_columns(); - const size_type num_rows = tbl.num_rows(); + }; + #endif int device_id; CUDA_TRY(cudaGetDevice(&device_id)); @@ -1173,12 +676,6 @@ std::vector> convert_to_rows2(cudf::table_view con CUDA_TRY( cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); -#if defined(DEBUG) - size_t free, total; - cudaMemGetInfo(&free, &total); - printf("%lu/%lu Memory\n", free, total); -#endif - // break up the work into blocks, which are a starting and ending row/col #. // this window size is calculated based on the shared memory size available // we want a single block to fill up the entire shared memory space available @@ -1194,78 +691,50 @@ std::vector> convert_to_rows2(cudf::table_view con // to that point. These are row batches and they are decided first before building the // windows so the windows can be properly cut around them. 
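Row batches exist because cudf list offsets are int32: once the accumulated bytes for a batch would overflow what a 32-bit offset can address, the output is cut back to the previous 32-row boundary and a new batch begins. A simplified host-side sketch of that cut logic; the limit and row sizes are stand-ins, and the byte/row roll-over mirrors the book-keeping above rather than a polished implementation:

#include <cassert>
#include <cstdint>
#include <vector>

struct row_batch_sketch { int64_t num_bytes; int32_t row_count; };

// Sketch: greedily fill batches, cutting back to a 32-row boundary when the
// next row would push the batch past the offset limit.
std::vector<row_batch_sketch> make_batches(std::vector<int64_t> const& row_sizes, int64_t limit)
{
  std::vector<row_batch_sketch> batches;
  int64_t batch_bytes = 0;
  int32_t batch_rows  = 0;
  for (auto const size : row_sizes) {
    if (batch_bytes + size > limit) {
      batches.push_back({batch_bytes, batch_rows & ~31});  // cut on the last 32-row boundary
      batch_bytes = 0;
      batch_rows  = batch_rows & 31;                       // rows past that boundary roll over
    }
    batch_bytes += size;
    ++batch_rows;
  }
  if (batch_bytes > 0) { batches.push_back({batch_bytes, batch_rows}); }
  return batches;
}

int main()
{
  std::vector<int64_t> const rows(100, 1024);          // 100 rows of 1 KiB each
  auto const batches = make_batches(rows, 64 * 1024);  // artificially small limit instead of INT32_MAX
  assert(batches.size() == 2);
  assert(batches[0].row_count == 64);  // already a multiple of 32, so nothing rolls over
  assert(batches[1].row_count == 36);  // the remaining rows
  return 0;
}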
- // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; - input_data.reserve(num_columns); - input_nm.reserve(num_columns); - for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); - auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (!nested_type) { - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - } - - auto dev_input_data = make_device_uvector_async(input_data, stream, mr); - auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); - - std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row + std::vector row_sizes; // size of each row in bytes including any alignment padding + std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column - std::vector column_starts; // offset of column inside a row including alignment - std::vector - variable_width_columns; // list of the variable width columns in the table + std::vector column_starts; // offset of column inside a row including alignment + std::vector variable_width_columns; // list of the variable width columns in the table row_sizes.reserve(num_rows); row_offsets.reserve(num_rows); column_sizes.reserve(num_columns); - column_starts.reserve(num_columns + 1); // we add a final offset for validity data start - - auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&tbl](auto i) -> std::tuple { - return std::make_tuple(tbl.column(i).type(), tbl.column(i)); - }); - - size_type fixed_width_size_per_row = detail::compute_column_information( - iter, - iter + num_columns, - column_starts, - column_sizes);//, -// [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); - /* size_type fixed_width_size_per_row = 0; - for (int col = 0; col < num_columns; ++col) { - auto cv = tbl.column(col); - auto col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (nested_type) { variable_width_columns.push_back(cv); } - - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 
8 : size_of(col_type); - - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - }*/ - -#if defined(DEBUG) - printf("validity offset will be %d + %d = %d\n", - column_starts.back(), - column_sizes.back(), - column_starts.back() + column_sizes.back()); -#endif - - - auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); - auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); - - std::vector row_batches; + column_starts.reserve(num_columns+1); // we add a final offset for validity data start + + size_type fixed_width_size_per_row = 0; + for (int col = 0; col < num_columns; ++col) { + auto cv = tbl.column(col); + auto col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (nested_type) { variable_width_columns.push_back(cv);} + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + } + + // When building the columns to return, we have to be mindful of the offset limit in cudf. + // It is 32-bit and these data columns are capable of surpassing that easily. The data should + // not be cut off exactly at the limit though due to the validity buffers. The most efficient + // place to cut the validity is on a 32-row boundary, so as we calculate the row sizes + // we keep track of the cut points for the validity, which we call row batches. If the row + // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we hit. + // Note that this boundary is for our book-keeping with column pointers and not anything + // that the kernel needs to worry about. We cut the output at convienient boundaries + // when assembling the outgoing data stream. + struct row_batch { + size_type num_bytes; + size_type row_count; + }; + std::vector row_batches; auto calculate_variable_width_row_data_size = [](int const row) { // each level of variable-width data will add an offset/length @@ -1277,156 +746,210 @@ std::vector> convert_to_rows2(cudf::table_view con // will be included in the variable-width data blob at the end of the // row. 
return 0; - /* auto c = variable_width_columns[col]; - while (true) { - auto col_offsets = c.child(0).data(); - auto col_data_size = size_of(c.child(1).type()); - std::size_t alignment_needed = col_data_size; - - row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; - if (c.num_children() == 0) { - break; - } - c = c.child(1); - } - */ +/* auto c = variable_width_columns[col]; + while (true) { + auto col_offsets = c.child(0).data(); + auto col_data_size = size_of(c.child(1).type()); + std::size_t alignment_needed = col_data_size; + + row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; + if (c.num_children() == 0) { + break; + } + c = c.child(1); + } +*/ }; uint64_t row_batch_size = 0; uint64_t total_table_size = 0; - size_type row_batch_rows = 0; - uint64_t row_offset = 0; + size_type row_batch_rows = 0; + uint64_t row_offset = 0; - // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then - // calculate the size of each row's variable-width data and validity as well. - auto validity_size = num_bitmask_words(num_columns) * 4; + // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then calculate + // the size of each row's variable-width data as well. for (int row = 0; row < num_rows; ++row) { - auto aligned_row_batch_size = - detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned - row_sizes[row] = fixed_width_size_per_row; - // validity is byte aligned - row_sizes[row] += validity_size; - // variable width data is 8-byte aligned - row_sizes[row] = detail::align_offset(row_sizes[row], 8) + - calculate_variable_width_row_data_size(row); // rows are 8 byte aligned - - if ((uint64_t)aligned_row_batch_size + row_sizes[row] > - (uint64_t)std::numeric_limits::max()) { + row_sizes[row] = fixed_width_size_per_row + calculate_variable_width_row_data_size(row); + if (row_batch_size + row_sizes[row] > std::numeric_limits::max()) { // a new batch starts at the last 32-row boundary - row_batches.push_back( - detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); - row_batch_size = 0; - row_batch_rows = row_batch_rows & 31; - row_offset = 0; - aligned_row_batch_size = 0; + row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batch_size = 0; + row_batch_rows = row_batch_rows & 31; + row_offset = 0; } - row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned + row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned row_offsets.push_back(row_offset); - row_batch_size = aligned_row_batch_size + row_sizes[row]; + row_batch_size += row_sizes[row]; row_offset += row_sizes[row]; - total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned + total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned total_table_size += row_sizes[row]; row_batch_rows++; } if (row_batch_size > 0) { - row_batches.push_back(detail::row_batch{static_cast(row_batch_size), row_batch_rows}); + row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); } - auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); - -#if defined(DEBUG) - printf("%d rows and %d columns in table\n", num_rows, num_columns); + #if defined(DEBUG) printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { printf("%d: %d rows, ", i, row_batches[i].row_count); pretty_print(row_batches[i].num_bytes); 
printf("\n"); } -#endif + #endif - std::vector output_buffers; - std::vector output_data; - output_data.reserve(row_batches.size()); - for (uint i = 0; i < row_batches.size(); ++i) { - rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); - output_data.push_back(static_cast(temp.data())); - output_buffers.push_back(std::move(temp)); + std::vector block_infos; + + // block infos are organized with the windows going "down" the columns + // this provides the most coalescing of memory access + int current_window_size = 0; + int current_window_start_col = 0; + + // build the blocks for a specific set of columns + auto build_blocks = [&block_infos, &row_batches, num_rows](int const start_col, int const end_col, int const desired_window_height) { + int current_window_start_row = 0; + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; + while (i < num_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(desired_window_height, rows_left_in_batch); + + block_infos.emplace_back( + detail::block_info{start_col, + current_window_start_row, + start_col + end_col, + std::min(current_window_start_row + window_height - 1, num_rows), current_window_row_batch}); + + i += window_height; + current_window_start_row += window_height; + rows_left_in_batch -= window_height; + } + }; + + int const window_height = std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); + + int row_size = 0; + + // march each column and build the blocks of appropriate sizes + for (int col = 0; col < num_columns; ++col) { + auto const col_size = column_sizes[col]; + + // align size for this type + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_with_this_col = detail::align_offset(row_size, alignment_needed) + col_size; + + if (row_size_with_this_col * window_height > shmem_limit_per_block) { + // too large, close this window, generate vertical blocks and restart + build_blocks(current_window_start_col, col - 1, window_height); + row_size = detail::align_offset(column_starts[col] & 7, alignment_needed) + col_size; // alignment required for shared memory window boundary to match alignment of output row + current_window_start_col = col; + } else { + row_size = row_size_with_this_col; + } } - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); + auto validity_offset = detail::align_offset(column_starts.back(), 4); + column_starts.push_back(validity_offset); + + // build last set of blocks + if (current_window_size > 0) { build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); } + + // Get the pointers to the input columnar data ready - possibly moved up to copy to gpu while calculating other things + std::vector input_data; + std::vector input_nm; + for (size_type column_number = 0; column_number < num_columns; column_number++) { + column_view cv = tbl.column(column_number); + auto const col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (!nested_type) { + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + } -#if defined(DEBUG) - printf("%lu windows for %d columns, %d rows to fit in ", - 
block_infos.size(), - block_infos[0].end_col - block_infos[0].start_col + 1, - block_infos[0].end_row - block_infos[0].start_row); + #if defined(DEBUG) + printf("%lu windows for %d columns, %d rows to fit in ", block_infos.size(), block_infos[0].end_col - block_infos[0].start_col, block_infos[0].end_row - block_infos[0].start_row); pretty_print(shmem_limit_per_block); printf(" shared mem("); pretty_print(fixed_width_size_per_row); printf("/row, %d columns, %d rows, ", num_columns, num_rows); pretty_print(total_table_size); printf(" total):\n"); -#endif + #endif - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); + auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + + std::vector output_data; + output_data.reserve(row_batches.size()); + for (uint i=0; i>>( - num_rows, - num_columns, - dev_input_data.data(), - dev_input_nm.data(), - dev_col_sizes.data(), - dev_col_starts.data(), - dev_block_infos.data(), - dev_row_offsets.data(), - reinterpret_cast(dev_output_data.data())); + dim3 blocks; + dim3 threads; + blocks.x = block_infos.size(); + blocks.y = 0; + blocks.z = 0; + threads.x = 1024; + threads.y = 0; + threads.z = 0; + detail::copy_from_columns<<>>(num_rows, + num_columns, + dev_input_data.data(), + dev_input_nm.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + dev_block_infos.data(), + dev_row_offsets.data(), + reinterpret_cast(dev_output_data.data())); // split up the output buffer into multiple buffers based on row batch sizes // and create list of byte columns int offset_offset = 0; std::vector> ret; - for (uint i = 0; i < row_batches.size(); ++i) { + for (uint i=0; i offset_vals; offset_vals.reserve(row_batches[i].row_count + 1); size_type cur_offset = 0; offset_vals.push_back(cur_offset); - for (int row = 0; row < row_batches[i].row_count; ++row) { - cur_offset = detail::align_offset(cur_offset, 8) + row_sizes[row + offset_offset]; + for (int row=0; row( - data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); + auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); + auto offsets = + std::make_unique(data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); - auto data = std::make_unique( - data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, std::move(output_buffers[i])); + auto data = + std::make_unique(data_type{cudf::type_id::INT8}, + row_batches[i].num_bytes, + std::move(output_data[i])); ret.push_back(cudf::make_lists_column(row_batches[i].row_count, - std::move(offsets), - std::move(data), - 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, - stream, - mr)); + std::move(offsets), + std::move(data), + 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, + stream, + mr)); } - + return ret; } @@ -1445,8 +968,8 @@ std::vector> convert_to_rows(cudf::table_view cons std::vector column_size; int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); - auto dev_column_start = make_device_uvector_async(column_start, stream, mr); - auto dev_column_size = make_device_uvector_async(column_size, stream, mr); + auto dev_column_start = 
detail::copy_to_dev_async(column_start, stream, mr); + auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; // Make the number of rows per batch a multiple of 32 so we don't have to worry about @@ -1463,16 +986,16 @@ std::vector> convert_to_rows(cudf::table_view cons input_data.emplace_back(cv.data()); input_nm.emplace_back(cv.null_mask()); } - auto dev_input_data = make_device_uvector_async(input_data, stream, mr); - auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); + auto dev_input_data = detail::copy_to_dev_async(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async(input_nm, stream, mr); using ScalarType = cudf::scalar_type_t; auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - zero->set_valid_async(true, stream); + zero->set_valid(true, stream); static_cast(zero.get())->set_value(0, stream); auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - step->set_valid_async(true, stream); + step->set_valid(true, stream); static_cast(step.get()) ->set_value(static_cast(size_per_row), stream); @@ -1500,100 +1023,6 @@ std::vector> convert_to_rows(cudf::table_view cons } } -std::unique_ptr convert_from_rows2(cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - // verify that the types are what we expect - cudf::column_view child = input.child(); - cudf::type_id list_type = child.type().id(); - CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, - "Only a list of bytes is supported as input"); - - cudf::size_type num_columns = schema.size(); - cudf::size_type num_rows = input.parent().size(); - - int device_id; - CUDA_TRY(cudaGetDevice(&device_id)); - int shmem_limit_per_block; - CUDA_TRY( - cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - - shmem_limit_per_block /= NUM_BLOCKS_PER_KERNEL_TO_COLUMNS; - - std::vector column_starts; - std::vector column_sizes; - - auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { - return std::make_tuple(schema[i], nullptr); - }); - size_type fixed_width_size_per_row = detail::compute_column_information( - iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); - - size_type validity_size = num_bitmask_words(num_columns) * 4; - - size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); - - // Ideally we would check that the offsets are all the same, etc. 
but for now - // this is probably fine - CUDF_EXPECTS(row_size * num_rows == child.size(), - "The layout of the data appears to be off"); - auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); - auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); - - // build the row_batches from the passed in list column - std::vector row_batches; - - row_batches.push_back(detail::row_batch{child.size(), num_rows}); - - // Allocate the columns we are going to write into - std::vector> output_columns; - std::vector output_data; - std::vector output_nm; - for (cudf::size_type i = 0; i < num_columns; i++) { - auto column = cudf::make_fixed_width_column( - schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); - auto mut = column->mutable_view(); - output_data.emplace_back(mut.data()); - output_nm.emplace_back(mut.null_mask()); - output_columns.emplace_back(std::move(column)); - } - - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); - - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - - dim3 blocks((block_infos.size() + (NUM_BLOCKS_PER_KERNEL_TO_COLUMNS - 1)) / NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); - #if defined(DEBUG) || 1 - dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)child.size())); - #else - dim3 threads(std::min(std::min(1024, shmem_limit_per_block / 8), (int)child.size())); - #endif -#if defined(DEBUG) - printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); - pretty_print(shmem_limit_per_block); - printf(" shared memory\n"); -#endif - detail::copy_to_columns<<>>( - num_rows, - num_columns, - shmem_limit_per_block, - input.offsets().data(), - dev_output_data.data(), - dev_output_nm.data(), - dev_col_sizes.data(), - dev_col_starts.data(), - dev_block_infos.data(), - child.data()); - - return std::make_unique(std::move(output_columns)); -} - std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, std::vector const &schema, rmm::cuda_stream_view stream, @@ -1618,8 +1047,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in // this is probably fine CUDF_EXPECTS(size_per_row * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_column_start = make_device_uvector_async(column_start, stream); - auto dev_column_size = make_device_uvector_async(column_size, stream); + auto dev_column_start = detail::copy_to_dev_async(column_start, stream, mr); + auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); // Allocate the columns we are going to write into std::vector> output_columns; @@ -1634,8 +1063,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in output_columns.emplace_back(std::move(column)); } - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); + auto dev_output_data = detail::copy_to_dev_async(output_data, stream, mr); + auto dev_output_nm = detail::copy_to_dev_async(output_nm, stream, mr); dim3 blocks; dim3 threads; @@ -1646,10 +1075,10 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in num_rows, num_columns, size_per_row, - dev_column_start.data(), - dev_column_size.data(), - dev_output_data.data(), - dev_output_nm.data(), + 
dev_column_start->data(), + dev_column_size->data(), + dev_output_data->data(), + dev_output_nm->data(), child.data()); return std::make_unique(std::move(output_columns)); @@ -1674,20 +1103,4 @@ std::unique_ptr convert_from_rows( // } } -std::unique_ptr convert_from_rows2( - std::vector> const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - CUDF_EXPECTS(input.size() == 1, "Too large of an input, need to concat the output tables..."); - - // for (uint i=0; iview(); - auto ret = convert_from_rows2(lcv, schema, stream, mr); - - return ret; - // } -} - } // namespace cudf From 3bff2aad0834b29b37df68f4b1d9cdf5e01e5742 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 10 Jun 2021 17:53:09 +0000 Subject: [PATCH 09/80] fixing kernel launch and updating --- .../row_conversion/row_conversion.cpp | 9 +- cpp/src/row_conversion/row_conversion.cu | 105 +++++++++++++----- 2 files changed, 83 insertions(+), 31 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index c4edee91b3c..9fa05c408e5 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -28,7 +28,7 @@ class RowConversion : public cudf::benchmark { static void BM_to_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, +/* auto const table = create_random_table({cudf::type_id::INT8, cudf::type_id::INT32, cudf::type_id::INT16, cudf::type_id::INT64, @@ -38,7 +38,10 @@ static void BM_to_row(benchmark::State& state) cudf::type_id::UINT8, cudf::type_id::UINT64}, 50, - row_count{n_rows}); + row_count{n_rows});*/ + auto const table = create_random_table({cudf::type_id::INT32}, + 64, + row_count{n_rows}); cudf::size_type total_bytes = 0; for (int i = 0; i < table->num_columns(); ++i) { @@ -98,7 +101,7 @@ static void BM_from_row(benchmark::State& state) (::benchmark::State & st) { BM_to_row(st); } \ BENCHMARK_REGISTER_F(RowConversion, name) \ ->RangeMultiplier(8) \ - ->Ranges({{1 << 16, 1 << 24}}) \ + ->Ranges({{1 << 6, 1 << 20}}) \ ->UseManualTime() \ ->Unit(benchmark::kMillisecond); diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index fb5dc4cb38d..994233a0700 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include #include @@ -347,14 +348,14 @@ struct block_info { * @param output_data pointer to output data * */ -__global__ void copy_from_columns(const cudf::size_type num_rows, - const cudf::size_type num_columns, +__global__ void copy_from_columns(const size_type num_rows, + const size_type num_columns, const int8_t **input_data, - const cudf::bitmask_type **input_nm, - const cudf::size_type *col_sizes, - const cudf::size_type *col_offsets, + const bitmask_type **input_nm, + const size_type *col_sizes, + const size_type *col_offsets, const block_info *block_infos, - const uint64_t *row_offsets, + const size_type *row_offsets, int8_t **output_data) { // We are going to copy the data in two passes. @@ -365,47 +366,92 @@ __global__ void copy_from_columns(const cudf::size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. 
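As a standalone illustration of that two-pass idea (an illustration only, not the patch's kernel), the sketch below stages one column-major int32 tile into shared memory in row-major order and then writes the packed rows back out. The kernel name pack_tile, the single-tile launch, and the fixed sizes are invented for the example; it also omits the mixed column widths, validity bits and alignment shims the real kernel has to handle.

#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

// Pass 1: read down each column (coalesced reads of col_major) and place each
// element at its final row-major position in shared memory.
// Pass 2: rows are now contiguous in shared memory, so the writes out to
// global memory are sequential.
__global__ void pack_tile(const int32_t *col_major,
                          int32_t *row_major,
                          int num_rows,
                          int num_cols)
{
  extern __shared__ int32_t tile[];  // num_rows * num_cols elements

  for (int i = threadIdx.x; i < num_rows * num_cols; i += blockDim.x) {
    int const col              = i / num_rows;
    int const row              = i % num_rows;
    tile[row * num_cols + col] = col_major[col * num_rows + row];
  }
  __syncthreads();

  for (int i = threadIdx.x; i < num_rows * num_cols; i += blockDim.x) {
    row_major[i] = tile[i];
  }
}

int main()
{
  constexpr int num_rows = 8, num_cols = 4, n = num_rows * num_cols;
  int32_t h_in[n], h_out[n];
  for (int c = 0; c < num_cols; ++c)
    for (int r = 0; r < num_rows; ++r) h_in[c * num_rows + r] = c * 100 + r;

  int32_t *d_in, *d_out;
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, sizeof(h_out));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);

  pack_tile<<<1, 64, n * sizeof(int32_t)>>>(d_in, d_out, num_rows, num_cols);

  cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
  printf("row 0: %d %d %d %d\n", h_out[0], h_out[1], h_out[2], h_out[3]);  // 0 100 200 300
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}

Built with nvcc, the printed first row is one value from each column packed contiguously, which is the layout the second pass of the real kernel wants to stream out eight bytes at a time.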
+ bool debug_print = false; + + if (debug_print) { + printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); + printf("Column Info:\n"); + for (int i=0; i(&output_data[0][output_start_offset]) & 7; // offset for alignment shim in order to match shared memory with final dest - - printf("copying from column %d to column %d with rows %d to row %d(grid dim %d, blockIdx %d)\n", block.start_col, block.end_col, block.start_row, block.end_row, gridDim.x, blockIdx.x); - + if (debug_print) { + printf("outputting to offset %lu\n", output_start_offset); + printf("dest shim offset is %d\n", dest_shim_offset); + printf("Shared data is %p-%p\n", shared_data, shared_data + (48 * 1024)); + } // each thread is responsible for every threadcount rows of data. // the data is copies into shared memory in the final layout. auto const shmem_row_size = align_offset(col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col] + dest_shim_offset, 8); // 8 byte alignment required for shared memory rows auto const validity_offset = col_offsets[num_columns]; + if (debug_print) { + printf("col_offsets[%d] = %d, col_sizes[%d] = %d, col_offsets[%d] = %d\n", block.end_col, col_offsets[block.end_col], block.end_col, col_sizes[block.end_col], block.start_col, col_offsets[block.start_col]); + printf("shmem row size %d\n", shmem_row_size); + printf("validity offset is %d\n", validity_offset); + printf("starting at %d,%d and going to %d, %d\n", block.start_col, block.start_row, block.end_col, block.end_row); + } for (int col=block.start_col; col<=block.end_col; ++col) { /*if (!col_is_variable) */{ uint64_t col_offset = 0; cudf::size_type col_size = col_sizes[col]; auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; + if (debug_print) { + printf("dest col offset %d\n", dest_col_offset); + } for (int row=block.start_row + threadIdx.x; row(input_data[col]); + if (debug_print) { + printf("%p <- short %d\n", shmem_dest, short_col_input[row]); + } *reinterpret_cast(shmem_dest) = short_col_input[row]; break; } case 4: { const int32_t *int_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { + printf("shmem[%d][%d] - %p <- int %d\n", row, col, shmem_dest, int_col_input[row]); + } *reinterpret_cast(shmem_dest) = int_col_input[row]; break; } case 8: { const int64_t *long_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { + printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); + } *reinterpret_cast(shmem_dest) = long_col_input[row]; break; } default: { cudf::size_type input_offset = col_size * row; - // TODO this should just not be supported for fixed width columns, but just in case... + if (debug_print) { + printf("byte for byte copy due to size %d\n", col_size); + printf("%p <- input_data[%d] which is %d\n", shmem_dest, input_offset, input_data[col][input_offset]); + } + // TODO this should just not be supported for fixed width columns, but just in case... for (cudf::size_type b = 0; b < col_size; b++) { shmem_dest[b] = input_data[col][b + input_offset]; } @@ -676,6 +722,12 @@ std::vector> convert_to_rows2(cudf::table_view con CUDA_TRY( cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + #if defined(DEBUG) + size_t free, total; + cudaMemGetInfo( &free, &total ); + printf("%lu/%lu Memory", free, total); + #endif + // break up the work into blocks, which are a starting and ending row/col #. 
// this window size is calculated based on the shared memory size available // we want a single block to fill up the entire shared memory space available @@ -692,7 +744,7 @@ std::vector> convert_to_rows2(cudf::table_view con // windows so the windows can be properly cut around them. std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row + std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column std::vector column_starts; // offset of column inside a row including alignment std::vector variable_width_columns; // list of the variable width columns in the table @@ -821,7 +873,7 @@ std::vector> convert_to_rows2(cudf::table_view con block_infos.emplace_back( detail::block_info{start_col, current_window_start_row, - start_col + end_col, + end_col, std::min(current_window_start_row + window_height - 1, num_rows), current_window_row_batch}); i += window_height; @@ -889,23 +941,20 @@ std::vector> convert_to_rows2(cudf::table_view con auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); - std::vector output_data; + std::vector output_buffers; + std::vector output_data; output_data.reserve(row_batches.size()); for (uint i=0; i(temp.data())); + output_buffers.push_back(std::move(temp)); } - auto dev_output_data = detail::copy_to_dev_async2(row_offsets, stream, mr); + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); // blast through the entire table and convert it - dim3 blocks; - dim3 threads; - blocks.x = block_infos.size(); - blocks.y = 0; - blocks.z = 0; - threads.x = 1024; - threads.y = 0; - threads.z = 0; - detail::copy_from_columns<<>>(num_rows, + dim3 blocks(block_infos.size()); + dim3 threads(1024); + copy_from_columns<<>>(num_rows, num_columns, dev_input_data.data(), dev_input_nm.data(), @@ -932,14 +981,14 @@ std::vector> convert_to_rows2(cudf::table_view con } offset_offset += row_batches[i].row_count; - auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); + auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); auto offsets = std::make_unique(data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); auto data = std::make_unique(data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, - std::move(output_data[i])); + std::move(output_buffers[i])); ret.push_back(cudf::make_lists_column(row_batches[i].row_count, std::move(offsets), From 8e52ba174b06f11ecd12a9f3fe35de17ade4f9e6 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 16 Jun 2021 19:25:57 +0000 Subject: [PATCH 10/80] Updates and bug fixing --- .../row_conversion/row_conversion.cpp | 76 ++- cpp/src/row_conversion/row_conversion.cu | 498 ++++++++++++------ cpp/tests/row_conversion/row_conversion.cpp | 106 ---- 3 files changed, 378 insertions(+), 302 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index 9fa05c408e5..e1228c9df21 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -25,10 +25,43 @@ class RowConversion : public cudf::benchmark { }; -static void BM_to_row(benchmark::State& state) +static void BM_old_to_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; -/* auto const table = 
create_random_table({cudf::type_id::INT8, + auto const table = create_random_table({cudf::type_id::INT8, + cudf::type_id::INT32, + cudf::type_id::INT16, + cudf::type_id::INT64, + cudf::type_id::INT32, + cudf::type_id::BOOL8, + cudf::type_id::UINT16, + cudf::type_id::UINT8, + cudf::type_id::UINT64}, + 212, + row_count{n_rows}); + /* auto const table = create_random_table({cudf::type_id::INT32}, + 64, + row_count{n_rows});*/ + + cudf::size_type total_bytes = 0; + for (int i = 0; i < table->num_columns(); ++i) { + auto t = table->get_column(i).type(); + total_bytes += cudf::size_of(t); + } + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + + auto rows = cudf::convert_to_rows(table->view()); + } + + state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); +} + +static void BM_new_to_row(benchmark::State& state) +{ + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + auto const table = create_random_table({cudf::type_id::INT8, cudf::type_id::INT32, cudf::type_id::INT16, cudf::type_id::INT64, @@ -37,11 +70,11 @@ static void BM_to_row(benchmark::State& state) cudf::type_id::UINT16, cudf::type_id::UINT8, cudf::type_id::UINT64}, - 50, - row_count{n_rows});*/ - auto const table = create_random_table({cudf::type_id::INT32}, - 64, - row_count{n_rows}); + 212, + row_count{n_rows}); + /* auto const table = create_random_table({cudf::type_id::INT32}, + 64, + row_count{n_rows});*/ cudf::size_type total_bytes = 0; for (int i = 0; i < table->num_columns(); ++i) { @@ -52,14 +85,13 @@ static void BM_to_row(benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); -// auto rows = cudf::convert_to_rows(table->view()); auto new_rows = cudf::convert_to_rows2(table->view()); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); } -static void BM_from_row(benchmark::State& state) +/*static void BM_from_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const table = create_random_table({cudf::type_id::INT8, @@ -73,9 +105,6 @@ static void BM_from_row(benchmark::State& state) cudf::type_id::UINT64}, 256, row_count{n_rows}); - /* auto const table = create_random_table({cudf::type_id::INT32}, - 4, - row_count{n_rows});*/ std::vector schema; cudf::size_type total_bytes = 0; @@ -94,18 +123,19 @@ static void BM_from_row(benchmark::State& state) } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { BM_to_row(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ +}*/ + +#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -TO_ROW_CONVERSION_BENCHMARK_DEFINE(to_row_conversion) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) #define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ BENCHMARK_DEFINE_F(RowConversion, name) \ @@ -116,4 +146,4 @@ TO_ROW_CONVERSION_BENCHMARK_DEFINE(to_row_conversion) ->UseManualTime() \ 
->Unit(benchmark::kMillisecond); -FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) +//FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 994233a0700..92ba075c316 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -44,7 +44,6 @@ static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size return (offset + alignment - 1) & ~(alignment - 1); } - /** * Copy a simple vector to device memory asynchronously. Be sure to read * the data on the same stream as is used to copy it. @@ -61,10 +60,9 @@ std::unique_ptr> copy_to_dev_async(const std::vector & } template -rmm::device_uvector copy_to_dev_async2( - const std::vector &input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) +rmm::device_uvector copy_to_dev_async2(const std::vector &input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { rmm::device_uvector ret(input.size(), stream, mr); CUDA_TRY(cudaMemcpyAsync( @@ -346,7 +344,7 @@ struct block_info { * @param block_infos information about the blocks of work * @param row_offsets offset to a specific row in the input data * @param output_data pointer to output data - * + * */ __global__ void copy_from_columns(const size_type num_rows, const size_type num_columns, @@ -366,92 +364,119 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. - bool debug_print = false; - + bool debug_print = false; // blockIdx.x == 70 && threadIdx.x == 448; + if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); printf("Column Info:\n"); - for (int i=0; i(&output_data[0][output_start_offset]) & 7; // offset for alignment shim in order to match shared memory with final dest + uint8_t const dest_shim_offset = + reinterpret_cast(&output_data[0][output_start_offset]) & + 7; // offset for alignment shim in order to match shared memory with final dest if (debug_print) { printf("outputting to offset %lu\n", output_start_offset); printf("dest shim offset is %d\n", dest_shim_offset); printf("Shared data is %p-%p\n", shared_data, shared_data + (48 * 1024)); + printf("my block is %d,%d -> %d,%d - buffer %d\n", + block.start_col, + block.start_row, + block.end_col, + block.end_row, + block.buffer_num); } // each thread is responsible for every threadcount rows of data. // the data is copies into shared memory in the final layout. 
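The shared-memory stride and alignment shim computed in the hunk below are plain integer arithmetic, so they are easy to sanity-check on the host. A minimal sketch, assuming made-up column offsets and a fake destination address; align_offset mirrors the helper defined near the top of this file.

#include <cstdint>
#include <cstdio>

// power-of-two alignment helper, same shape as the one in row_conversion.cu
static int32_t align_offset(int32_t offset, std::size_t alignment)
{
  return (offset + alignment - 1) & ~(alignment - 1);
}

int main()
{
  // assumed example: a window whose first column starts at byte 6 of the row
  // and whose last column is one byte wide starting at byte 20
  int32_t const start_col_offset = 6;
  int32_t const end_col_offset   = 20;
  int32_t const end_col_size     = 1;

  // pretend the destination row address ends in ...3: the low three bits are
  // the shim needed so shared memory lines up with the final destination
  uintptr_t const fake_dest_addr = 0x7f0000000003u;
  int const dest_shim_offset     = static_cast<int>(fake_dest_addr & 7);

  int32_t const real_bytes_in_row = end_col_offset + end_col_size - start_col_offset;
  int32_t const shmem_row_size    = align_offset(real_bytes_in_row + dest_shim_offset, 8);

  printf("bytes in window row: %d, shim: %d, shared-memory stride: %d\n",
         real_bytes_in_row, dest_shim_offset, shmem_row_size);
  return 0;
}

With these numbers the window carries 15 real bytes per row, needs a 3-byte shim to match the destination alignment, and so each staged row occupies a 24-byte, 8-byte-aligned stride in shared memory.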
- auto const shmem_row_size = align_offset(col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col] + dest_shim_offset, 8); // 8 byte alignment required for shared memory rows + auto const real_bytes_in_row = + col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col]; + auto const shmem_row_size = align_offset(real_bytes_in_row + dest_shim_offset, + 8); // 8 byte alignment required for shared memory rows auto const validity_offset = col_offsets[num_columns]; if (debug_print) { - printf("col_offsets[%d] = %d, col_sizes[%d] = %d, col_offsets[%d] = %d\n", block.end_col, col_offsets[block.end_col], block.end_col, col_sizes[block.end_col], block.start_col, col_offsets[block.start_col]); + printf("col_offsets[%d] = %d, col_sizes[%d] = %d, col_offsets[%d] = %d\n", + block.end_col, + col_offsets[block.end_col], + block.end_col, + col_sizes[block.end_col], + block.start_col, + col_offsets[block.start_col]); printf("shmem row size %d\n", shmem_row_size); printf("validity offset is %d\n", validity_offset); - printf("starting at %d,%d and going to %d, %d\n", block.start_col, block.start_row, block.end_col, block.end_row); + printf("starting at %d,%d and going to %d, %d\n", + block.start_col, + block.start_row, + block.end_col, + block.end_row); } - for (int col=block.start_col; col<=block.end_col; ++col) { - /*if (!col_is_variable) */{ - uint64_t col_offset = 0; + for (int col = block.start_col; col <= block.end_col; ++col) { + /*if (!col_is_variable) */ { + uint64_t col_offset = 0; cudf::size_type col_size = col_sizes[col]; - auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; - if (debug_print) { - printf("dest col offset %d\n", dest_col_offset); - } - for (int row=block.start_row + threadIdx.x; row(input_data[col]); - if (debug_print) { - printf("%p <- short %d\n", shmem_dest, short_col_input[row]); - } + const int16_t *short_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { printf("%p <- short %d\n", shmem_dest, short_col_input[row]); } *reinterpret_cast(shmem_dest) = short_col_input[row]; break; } case 4: { - const int32_t *int_col_input = reinterpret_cast(input_data[col]); + const int32_t *int_col_input = reinterpret_cast(input_data[col]); if (debug_print) { - printf("shmem[%d][%d] - %p <- int %d\n", row, col, shmem_dest, int_col_input[row]); + printf("shmem[%d][%d] - %p <- int 0x%x\n", row, col, shmem_dest, int_col_input[row]); } *reinterpret_cast(shmem_dest) = int_col_input[row]; break; } case 8: { - const int64_t *long_col_input = reinterpret_cast(input_data[col]); - if (debug_print) { - printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); - } + const int64_t *long_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); } *reinterpret_cast(shmem_dest) = long_col_input[row]; break; } default: { cudf::size_type input_offset = col_size * row; if (debug_print) { - printf("byte for byte copy due to size %d\n", col_size); - printf("%p <- input_data[%d] which is %d\n", shmem_dest, input_offset, input_data[col][input_offset]); - } - // TODO this should just not be supported for fixed width columns, but just in case... + printf("byte for byte copy due to size %d of column %d\n", col_size, col); + printf("%p <- input_data[%d] which is %d\n", + shmem_dest, + input_offset, + input_data[col][input_offset]); + } + // TODO this should just not be supported for fixed width columns, but just in case... 
for (cudf::size_type b = 0; b < col_size; b++) { shmem_dest[b] = input_data[col][b + input_offset]; } @@ -463,11 +488,13 @@ __global__ void copy_from_columns(const size_type num_rows, // so we have to rewrite the addresses to make sure that it is 4 byte aligned // we do this directly in the final location because the entire row may not // fit in shared memory and may require many blocks to process it entirely - int8_t *valid_byte = &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; + int8_t *valid_byte = + &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; cudf::size_type byte_bit_offset = col % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); + if (debug_print) { printf("Outputting validity to %p\n", valid_byte); } // Now copy validity for the column if (input_nm[col]) { if (bit_is_set(input_nm[col], row)) { @@ -479,11 +506,11 @@ __global__ void copy_from_columns(const size_type num_rows, // It is valid so just set the bit atomicOr_block(valid_int, 1 << int_bit_offset); } - } // end row + } // end row - col_offset += col_sizes[col] * (block.end_row - block.start_row); + col_offset += col_sizes[col] * rows_in_block; } - } // end col + } // end col // wait for the data to be totally copied into shared memory __syncthreads(); @@ -496,30 +523,75 @@ __global__ void copy_from_columns(const size_type num_rows, // row in shared memory may not be an entire row of the destination. // auto const thread_start_offset = threadIdx.x * 8; - auto const thread_stride = gridDim.x * 8; - for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * (block.end_row - block.start_row); src_offset += thread_stride) { + auto const thread_stride = gridDim.x * 8; + if (debug_print) { + printf("writing final data from %d to %d at stride %d\n", + thread_start_offset, + shmem_row_size * rows_in_block, + thread_stride); + printf("rows in block %d\n", rows_in_block); + } + for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * rows_in_block; + src_offset += thread_stride) { auto const output_row_num = src_offset / shmem_row_size; - auto const row_offset = row_offsets[block.start_row + output_row_num]; - auto const col_offset = src_offset % shmem_row_size; - int8_t *output_ptr = &output_data[block.buffer_num][row_offset + col_offset]; - int8_t *input_ptr = &shared_data[src_offset]; - // the first part and last part of the row is unaligned data copy. This is copied a single byte - // at a time. - if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { - // first part of a row, copy single bytes + auto const row_offset = row_offsets[block.start_row + output_row_num]; + auto const col_offset = src_offset % shmem_row_size; + int8_t *output_ptr = &output_data[block.buffer_num][row_offset + col_offset]; + int8_t *input_ptr = &shared_data[src_offset]; + + // three cases to worry about here + // 1) first 8-byte part of a large row - dest_shim_offset bytes of pad at the front + // 2) last 8-byte part of a large row - some bytes of pad at the end + // 3) corner case of <= 8 bytes of data, which means dest_shim_offset bytes of pad at the front + // AND potentially pad at the rear + + // we know the real number of bytes in a row, so we can figure out if we are in case 3 easily. + // 1st case is when we're at some even multiple of shmem_row_size offset. 
+ // 2nd case is when offset + 8 is some even multiple of shmem_row_size. + // must be an 8 byte copy + + // there is a chance we have a 0 dest_shim_offset and an 8 byte thing to copy, optimize? + if (real_bytes_in_row + dest_shim_offset <= 8) { + // case 3, we want to copy real_bytes_in_row bytes + auto const num_single_bytes = real_bytes_in_row - dest_shim_offset; + for (auto i = 0; i < num_single_bytes; ++i) { + if (debug_print) { + printf("case 3 - %d single byte final write %p -> %p\n", + num_single_bytes, + &input_ptr[i + dest_shim_offset], + &output_ptr[i]); + } + output_ptr[i] = input_ptr[i + dest_shim_offset]; + } + } else if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { + // first byte with leading pad auto const num_single_bytes = 8 - dest_shim_offset; - for (auto i=0; i %p\n", &input_ptr[i + dest_shim_offset], &output_ptr[i]); + } output_ptr[i] = input_ptr[i + dest_shim_offset]; } - } else if (dest_shim_offset > 0 && (src_offset + 8) % shmem_row_size == 0) { - // last part of a row, copy single bytes - auto const num_single_bytes = dest_shim_offset; - for (auto i=0; i 0) { + // last bytes of a row + auto const num_single_bytes = (real_bytes_in_row + dest_shim_offset) % 8; + for (auto i = 0; i < num_single_bytes; ++i) { + if (debug_print) { + printf("single trailing byte final write %p -> %p\n", + &input_ptr[i + dest_shim_offset], + &output_ptr[i]); + } output_ptr[i] = input_ptr[i + dest_shim_offset]; } } else { // copy 8 bytes aligned - const int64_t *long_col_input = reinterpret_cast(input_ptr); + const int64_t *long_col_input = reinterpret_cast(input_ptr); + if (debug_print) { + printf( + "long final write %p -> %p\n", long_col_input, reinterpret_cast(output_ptr)); + } *reinterpret_cast(output_ptr) = *long_col_input; } } @@ -696,13 +768,14 @@ std::vector> convert_to_rows2(cudf::table_view con rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the data, but small enough - // that multiple columns fit in memory so the writes can coalese as well. Potential optimization for window sizes. + // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the + // data, but small enough that multiple columns fit in memory so the writes can coalese as well. + // Potential optimization for window sizes. constexpr int max_window_height = 1024; - const size_type num_columns = tbl.num_columns(); - const size_type num_rows = tbl.num_rows(); + const size_type num_columns = tbl.num_columns(); + const size_type num_rows = tbl.num_rows(); - #if defined(DEBUG) +#if defined(DEBUG) auto pretty_print = [](uint64_t i) { if (i > (1 * 1024 * 1024 * 1024)) { printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); @@ -714,7 +787,7 @@ std::vector> convert_to_rows2(cudf::table_view con printf("%lu Bytes", i); } }; - #endif +#endif int device_id; CUDA_TRY(cudaGetDevice(&device_id)); @@ -722,11 +795,11 @@ std::vector> convert_to_rows2(cudf::table_view con CUDA_TRY( cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - #if defined(DEBUG) +#if defined(DEBUG) size_t free, total; - cudaMemGetInfo( &free, &total ); - printf("%lu/%lu Memory", free, total); - #endif + cudaMemGetInfo(&free, &total); + printf("%lu/%lu Memory\n", free, total); +#endif // break up the work into blocks, which are a starting and ending row/col #. 
// this window size is calculated based on the shared memory size available @@ -743,45 +816,46 @@ std::vector> convert_to_rows2(cudf::table_view con // to that point. These are row batches and they are decided first before building the // windows so the windows can be properly cut around them. - std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row + std::vector row_sizes; // size of each row in bytes including any alignment padding + std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column - std::vector column_starts; // offset of column inside a row including alignment - std::vector variable_width_columns; // list of the variable width columns in the table + std::vector column_starts; // offset of column inside a row including alignment + std::vector + variable_width_columns; // list of the variable width columns in the table row_sizes.reserve(num_rows); row_offsets.reserve(num_rows); column_sizes.reserve(num_columns); - column_starts.reserve(num_columns+1); // we add a final offset for validity data start + column_starts.reserve(num_columns + 1); // we add a final offset for validity data start size_type fixed_width_size_per_row = 0; for (int col = 0; col < num_columns; ++col) { - auto cv = tbl.column(col); - auto col_type = cv.type(); + auto cv = tbl.column(col); + auto col_type = cv.type(); bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - if (nested_type) { variable_width_columns.push_back(cv);} + if (nested_type) { variable_width_columns.push_back(cv); } // a list or string column will write a single uint64 // of data here for offset/length auto col_size = nested_type ? 8 : size_of(col_type); // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); column_starts.push_back(fixed_width_size_per_row); column_sizes.push_back(col_size); fixed_width_size_per_row += col_size; } - + // When building the columns to return, we have to be mindful of the offset limit in cudf. // It is 32-bit and these data columns are capable of surpassing that easily. The data should // not be cut off exactly at the limit though due to the validity buffers. The most efficient // place to cut the validity is on a 32-row boundary, so as we calculate the row sizes // we keep track of the cut points for the validity, which we call row batches. If the row - // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we hit. - // Note that this boundary is for our book-keeping with column pointers and not anything - // that the kernel needs to worry about. We cut the output at convienient boundaries - // when assembling the outgoing data stream. + // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we + // hit. Note that this boundary is for our book-keeping with column pointers and not anything that + // the kernel needs to worry about. We cut the output at convienient boundaries when assembling + // the outgoing data stream. 
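A host-only sketch of that batching rule, with invented (uniform) row sizes: close a batch just before its byte count would overflow a 32-bit offset, and cut it back to the last multiple of 32 rows so no validity word straddles two batches.

#include <cstdint>
#include <cstdio>
#include <limits>
#include <vector>

int main()
{
  std::vector<uint64_t> row_sizes(100000, 48 * 1024);  // pretend 48 KiB rows

  struct batch { uint64_t num_bytes; int row_count; };
  std::vector<batch> batches;

  uint64_t batch_bytes = 0;
  int batch_rows       = 0;
  for (auto size : row_sizes) {
    if (batch_bytes + size > std::numeric_limits<int32_t>::max()) {
      int const keep = batch_rows & ~31;            // last 32-row boundary
      // bytes belonging to the rows we keep (uniform sizes keep this simple)
      uint64_t const keep_bytes = uint64_t(keep) * size;
      batches.push_back({keep_bytes, keep});
      batch_rows -= keep;                           // rows carried into the next batch
      batch_bytes -= keep_bytes;
    }
    batch_bytes += size;
    ++batch_rows;
  }
  if (batch_rows > 0) batches.push_back({batch_bytes, batch_rows});

  for (auto const &b : batches)
    printf("batch: %d rows, %llu bytes\n",
           b.row_count,
           static_cast<unsigned long long>(b.num_bytes));
  return 0;
}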
struct row_batch { size_type num_bytes; size_type row_count; @@ -798,71 +872,90 @@ std::vector> convert_to_rows2(cudf::table_view con // will be included in the variable-width data blob at the end of the // row. return 0; -/* auto c = variable_width_columns[col]; - while (true) { - auto col_offsets = c.child(0).data(); - auto col_data_size = size_of(c.child(1).type()); - std::size_t alignment_needed = col_data_size; - - row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; - if (c.num_children() == 0) { - break; - } - c = c.child(1); - } -*/ + /* auto c = variable_width_columns[col]; + while (true) { + auto col_offsets = c.child(0).data(); + auto col_data_size = size_of(c.child(1).type()); + std::size_t alignment_needed = col_data_size; + + row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; + if (c.num_children() == 0) { + break; + } + c = c.child(1); + } + */ }; uint64_t row_batch_size = 0; uint64_t total_table_size = 0; - size_type row_batch_rows = 0; - uint64_t row_offset = 0; + size_type row_batch_rows = 0; + uint64_t row_offset = 0; + + auto calculate_validity_size = [](int const num_cols) { + // Now we need to add in space for validity + // Eventually we can think about nullable vs not nullable, but for now we will just always add + // it in + return (num_cols + 7) / 8; + }; - // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then calculate - // the size of each row's variable-width data as well. + // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then + // calculate the size of each row's variable-width data and validity as well. for (int row = 0; row < num_rows; ++row) { - row_sizes[row] = fixed_width_size_per_row + calculate_variable_width_row_data_size(row); - if (row_batch_size + row_sizes[row] > std::numeric_limits::max()) { + auto aligned_row_batch_size = + detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned + row_sizes[row] = fixed_width_size_per_row; + // validity is byte aligned + row_sizes[row] += calculate_validity_size(num_columns); + // variable width data is 8-byte aligned + row_sizes[row] = detail::align_offset(row_sizes[row], 8) + + calculate_variable_width_row_data_size(row); // rows are 8 byte aligned + + if (aligned_row_batch_size + row_sizes[row] > std::numeric_limits::max()) { // a new batch starts at the last 32-row boundary - row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); - row_batch_size = 0; - row_batch_rows = row_batch_rows & 31; - row_offset = 0; + row_batches.push_back( + row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batch_size = 0; + row_batch_rows = row_batch_rows & 31; + row_offset = 0; + aligned_row_batch_size = 0; } - row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned + row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned row_offsets.push_back(row_offset); - row_batch_size += row_sizes[row]; + row_batch_size = aligned_row_batch_size + row_sizes[row]; row_offset += row_sizes[row]; - total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned + total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned total_table_size += row_sizes[row]; row_batch_rows++; } if (row_batch_size > 0) { - row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows}); 
} - #if defined(DEBUG) +#if defined(DEBUG) + printf("%d rows and %d columns in table\n", num_rows, num_columns); printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { printf("%d: %d rows, ", i, row_batches[i].row_count); pretty_print(row_batches[i].num_bytes); printf("\n"); } - #endif +#endif std::vector block_infos; // block infos are organized with the windows going "down" the columns // this provides the most coalescing of memory access - int current_window_size = 0; + int current_window_width = 0; int current_window_start_col = 0; // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, num_rows](int const start_col, int const end_col, int const desired_window_height) { + auto build_blocks = [&block_infos, &row_batches, num_rows]( + int const start_col, int const end_col, int const desired_window_height) { int current_window_start_row = 0; int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; while (i < num_rows) { if (rows_left_in_batch == 0) { current_window_row_batch++; @@ -872,9 +965,10 @@ std::vector> convert_to_rows2(cudf::table_view con block_infos.emplace_back( detail::block_info{start_col, - current_window_start_row, - end_col, - std::min(current_window_start_row + window_height - 1, num_rows), current_window_row_batch}); + current_window_start_row, + end_col, + std::min(current_window_start_row + window_height - 1, num_rows - 1), + current_window_row_batch}); i += window_height; current_window_start_row += window_height; @@ -882,7 +976,17 @@ std::vector> convert_to_rows2(cudf::table_view con } }; - int const window_height = std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); + int const window_height = + std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); +#if defined(DEBUG) + printf( + "max_window_height is %d, num_rows is %d, batch row count is %d - which makes window height " + "%d\n", + max_window_height, + num_rows, + row_batches[0].row_count, + window_height); +#endif int row_size = 0; @@ -891,32 +995,74 @@ std::vector> convert_to_rows2(cudf::table_view con auto const col_size = column_sizes[col]; // align size for this type - std::size_t alignment_needed = col_size; // They are the same for fixed width types - auto row_size_with_this_col = detail::align_offset(row_size, alignment_needed) + col_size; + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_aligned = detail::align_offset(row_size, alignment_needed); + auto row_size_with_this_col = row_size_aligned + col_size; if (row_size_with_this_col * window_height > shmem_limit_per_block) { +#if defined(DEBUG) + printf( + "Window size %d too large at column %d, bumping back to build windows of size %d(cols " + "%d-%d), which is %d tall. 
Row size is too large at %d and ok at %d(aligned overall is %d) " + "for shared mem size %d\n", + row_size_with_this_col * window_height, + col, + row_size * window_height, + current_window_start_col, + col - 1, + window_height, + row_size_with_this_col, + row_size, + row_size_aligned, + shmem_limit_per_block); +#endif // too large, close this window, generate vertical blocks and restart build_blocks(current_window_start_col, col - 1, window_height); - row_size = detail::align_offset(column_starts[col] & 7, alignment_needed) + col_size; // alignment required for shared memory window boundary to match alignment of output row + row_size = + detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); +#if defined(DEBUG) + printf( + "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " + "or %d)\n", + row_size, + col_size, + row_size + col_size, + column_starts[col - 1], + column_sizes[col - 1], + column_starts[col - 1] + column_sizes[col - 1]); +#endif + row_size += col_size; // alignment required for shared memory window boundary to match + // alignment of output row current_window_start_col = col; + current_window_width = 0; } else { row_size = row_size_with_this_col; + current_window_width++; } } - auto validity_offset = detail::align_offset(column_starts.back(), 4); +#if defined(DEBUG) + printf("validity offset will be %d + %d = %d\n", + column_starts.back(), + column_sizes.back(), + column_starts.back() + column_sizes.back()); +#endif + auto validity_offset = detail::align_offset(column_starts.back() + column_sizes.back(), 4); column_starts.push_back(validity_offset); - + // build last set of blocks - if (current_window_size > 0) { build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); } + if (current_window_width > 0) { + build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); + } - // Get the pointers to the input columnar data ready - possibly moved up to copy to gpu while calculating other things + // Get the pointers to the input columnar data ready - possibly moved up to copy to gpu while + // calculating other things std::vector input_data; std::vector input_nm; for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); + column_view cv = tbl.column(column_number); auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; if (!nested_type) { input_data.emplace_back(cv.data()); @@ -924,81 +1070,87 @@ std::vector> convert_to_rows2(cudf::table_view con } } - #if defined(DEBUG) - printf("%lu windows for %d columns, %d rows to fit in ", block_infos.size(), block_infos[0].end_col - block_infos[0].start_col, block_infos[0].end_row - block_infos[0].start_row); +#if defined(DEBUG) + printf("%lu windows for %d columns, %d rows to fit in ", + block_infos.size(), + block_infos[0].end_col - block_infos[0].start_col + 1, + block_infos[0].end_row - block_infos[0].start_row); pretty_print(shmem_limit_per_block); printf(" shared mem("); pretty_print(fixed_width_size_per_row); printf("/row, %d columns, %d rows, ", num_columns, num_rows); pretty_print(total_table_size); printf(" total):\n"); - #endif +#endif auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); auto 
dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); - auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); std::vector output_buffers; std::vector output_data; output_data.reserve(row_batches.size()); - for (uint i=0; i(temp.data())); output_buffers.push_back(std::move(temp)); } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); // blast through the entire table and convert it dim3 blocks(block_infos.size()); - dim3 threads(1024); - copy_from_columns<<>>(num_rows, - num_columns, - dev_input_data.data(), - dev_input_nm.data(), - dev_col_sizes.data(), - dev_col_starts.data(), - dev_block_infos.data(), - dev_row_offsets.data(), - reinterpret_cast(dev_output_data.data())); + dim3 threads(std::min((uint64_t)1024, total_table_size / 8)); +#if defined(DEBUG) + printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); + pretty_print(shmem_limit_per_block); + printf(" shared memory\n"); +#endif + copy_from_columns<<>>( + num_rows, + num_columns, + dev_input_data.data(), + dev_input_nm.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + dev_block_infos.data(), + dev_row_offsets.data(), + reinterpret_cast(dev_output_data.data())); // split up the output buffer into multiple buffers based on row batch sizes // and create list of byte columns int offset_offset = 0; std::vector> ret; - for (uint i=0; i offset_vals; offset_vals.reserve(row_batches[i].row_count + 1); size_type cur_offset = 0; offset_vals.push_back(cur_offset); - for (int row=0; row(data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); + auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); + auto offsets = std::make_unique( + data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); - auto data = - std::make_unique(data_type{cudf::type_id::INT8}, - row_batches[i].num_bytes, - std::move(output_buffers[i])); + auto data = std::make_unique( + data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, std::move(output_buffers[i])); ret.push_back(cudf::make_lists_column(row_batches[i].row_count, - std::move(offsets), - std::move(data), - 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, - stream, - mr)); + std::move(offsets), + std::move(data), + 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, + stream, + mr)); } - + return ret; } diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index 818d7a89ddb..c02f83ad1d5 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -21,13 +21,9 @@ #include #include -#include "cudf/lists/lists_column_view.hpp" -#include "cudf/types.hpp" struct ColumnToRowTests : public cudf::test::BaseFixture { }; -struct RowToColumnTests : public cudf::test::BaseFixture { -}; TEST_F(ColumnToRowTests, Single) { @@ -112,105 +108,3 @@ TEST_F(ColumnToRowTests, SingleByteWide) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } } - -TEST_F(RowToColumnTests, Single) -{ - cudf::test::fixed_width_column_wrapper a({-1}); - cudf::table_view in(std::vector{a}); 
- - auto old_rows = cudf::convert_to_rows(in); - std::vector schema{cudf::data_type{cudf::type_id::INT32}}; - for (uint i=0; i a({-1, 0, 1}); - cudf::table_view in(std::vector{a}); - - auto old_rows = cudf::convert_to_rows(in); - std::vector schema{cudf::data_type{cudf::type_id::INT32}}; - for (uint i=0; i int32_t { return rand(); }); - cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); - cudf::table_view in(std::vector{a}); - - auto old_rows = cudf::convert_to_rows(in); - std::vector schema; - schema.reserve(in.num_columns()); - for (auto col = in.begin(); col < in.end(); ++col) { - schema.push_back(col->type()); - } - for (uint i=0; i> cols; - std::vector views; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); - views.push_back(cols.back()); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows(in); - std::vector schema; - schema.reserve(in.num_columns()); - for (auto col = in.begin(); col < in.end(); ++col) { - schema.push_back(col->type()); - } - - for (uint i=0; i> cols; - std::vector views; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); - views.push_back(cols.back()); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows(in); - std::vector schema; - schema.reserve(in.num_columns()); - for (auto col = in.begin(); col < in.end(); ++col) { - schema.push_back(col->type()); - } - for (uint i=0; i Date: Mon, 21 Jun 2021 18:17:45 +0000 Subject: [PATCH 11/80] Updating windows to be generated in a square way so we can have more data to write out as 8-byte writes from shared memory. Shuffled some of the copy to GPU code up so it can start the copy sooner and hopefully won't force stalls. Some bug fixes. 
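A rough host-side sketch of the "square" window sizing described in this commit message, under assumed numbers (48 KiB of shared memory per block, a 4-byte first column): aim for a window whose height in rows and width in bytes are both near the square root of the shared memory size, so both the column reads and the 8-byte row writes stay reasonably long.

#include <algorithm>
#include <cmath>
#include <cstdio>

int main()
{
  int const shmem_limit    = 48 * 1024;  // assumed shared memory per block
  int const num_rows       = 1 << 20;    // rows in the (pretend) table
  int const first_col_size = 4;          // bytes of the first column

  // height in rows: roughly sqrt(shared memory) bytes worth of the first column
  int const window_height =
    std::min(static_cast<int>(std::sqrt(static_cast<double>(shmem_limit))) / first_col_size,
             num_rows);

  // given that height, the widest slice of a row (in bytes) a window can hold
  int const max_row_bytes = shmem_limit / window_height;

  printf("window: %d rows tall, up to %d bytes of columns wide\n",
         window_height, max_row_bytes);
  return 0;
}

The real code additionally caps the height at the first row batch's row count and rebuilds the running width as columns of different sizes are appended; this sketch only shows the square-root starting point.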
--- .../row_conversion/row_conversion.cpp | 15 ++- cpp/src/row_conversion/row_conversion.cu | 96 +++++++++++-------- 2 files changed, 67 insertions(+), 44 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index e1228c9df21..d6b195433cf 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -125,7 +125,7 @@ static void BM_new_to_row(benchmark::State& state) state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); }*/ -#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ +#define OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ BENCHMARK_DEFINE_F(RowConversion, name) \ (::benchmark::State & st) { f(st); } \ BENCHMARK_REGISTER_F(RowConversion, name) \ @@ -134,8 +134,17 @@ static void BM_new_to_row(benchmark::State& state) ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) -TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) +#define NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) +NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) #define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ BENCHMARK_DEFINE_F(RowConversion, name) \ diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 92ba075c316..3f221e2f716 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -364,7 +364,7 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. 
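One part of the kernel touched in this hunk that can be checked without a GPU is the validity packing: the byte that holds a column's null bit is rounded down to a 4-byte boundary and the bit index is bumped up by the bytes stepped over, so the store can be issued as a single 32-bit atomic OR. A small sketch of just that address arithmetic, with a made-up column index.

#include <cstdint>
#include <cstdio>

int main()
{
  // a row's validity area, viewed both as 32-bit words and as raw bytes
  uint32_t validity_words[4] = {0};
  auto *row_bytes = reinterpret_cast<unsigned char *>(validity_words);

  int const column          = 13;                    // bit to set (made up)
  unsigned char *valid_byte = &row_bytes[column / 8];
  int const byte_bit_offset = column % 8;

  // round the byte address down to a 4-byte boundary...
  uintptr_t const fixup_bytes = reinterpret_cast<uintptr_t>(valid_byte) % 4;
  uint32_t *valid_word = reinterpret_cast<uint32_t *>(valid_byte - fixup_bytes);
  // ...and push the bit index up by the bytes stepped back over
  int const word_bit_offset = byte_bit_offset + static_cast<int>(fixup_bytes) * 8;

  *valid_word |= 1u << word_bit_offset;  // the device code does this atomically

  printf("validity byte %d is now 0x%02x (little-endian layout, as on the GPU)\n",
         column / 8, row_bytes[column / 8]);
  return 0;
}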
- bool debug_print = false; // blockIdx.x == 70 && threadIdx.x == 448; + constexpr bool debug_print = false; //blockIdx.x == 2649 && threadIdx.x == 479; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -383,6 +383,7 @@ __global__ void copy_from_columns(const size_type num_rows, }*/ printf("output data to %p\n", output_data[block_infos[blockIdx.x].buffer_num]); } + //else { return; } auto block = block_infos[blockIdx.x]; auto const rows_in_block = block.end_row - block.start_row + 1; extern __shared__ int8_t shared_data[]; @@ -416,7 +417,7 @@ __global__ void copy_from_columns(const size_type num_rows, col_sizes[block.end_col], block.start_col, col_offsets[block.start_col]); - printf("shmem row size %d\n", shmem_row_size); + printf("shmem row size %d with real bytes %d\n", shmem_row_size, real_bytes_in_row); printf("validity offset is %d\n", validity_offset); printf("starting at %d,%d and going to %d, %d\n", block.start_col, @@ -524,6 +525,8 @@ __global__ void copy_from_columns(const size_type num_rows, // auto const thread_start_offset = threadIdx.x * 8; auto const thread_stride = gridDim.x * 8; + auto const end_offset = shmem_row_size * rows_in_block; + if (debug_print) { printf("writing final data from %d to %d at stride %d\n", thread_start_offset, @@ -531,7 +534,7 @@ __global__ void copy_from_columns(const size_type num_rows, thread_stride); printf("rows in block %d\n", rows_in_block); } - for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * rows_in_block; + for (auto src_offset = thread_start_offset; src_offset < end_offset; src_offset += thread_stride) { auto const output_row_num = src_offset / shmem_row_size; auto const row_offset = row_offsets[block.start_row + output_row_num]; @@ -771,7 +774,6 @@ std::vector> convert_to_rows2(cudf::table_view con // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the // data, but small enough that multiple columns fit in memory so the writes can coalese as well. // Potential optimization for window sizes. - constexpr int max_window_height = 1024; const size_type num_columns = tbl.num_columns(); const size_type num_rows = tbl.num_rows(); @@ -816,6 +818,25 @@ std::vector> convert_to_rows2(cudf::table_view con // to that point. These are row batches and they are decided first before building the // windows so the windows can be properly cut around them. 
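The hunk that follows moves the gathering of the per-column data and null-mask pointers, and their device copies, ahead of the row-size bookkeeping so the transfers can start while the host is still working. A minimal sketch of that overlap pattern using only the CUDA runtime, with invented buffer contents; note that a pageable host buffer limits how much overlap actually happens, so pinning the staging buffer (cudaMallocHost) would be the natural next step.

#include <cuda_runtime.h>
#include <cstdio>
#include <numeric>
#include <vector>

int main()
{
  std::vector<int> staging(1024);                 // stand-in for the pointer tables
  std::iota(staging.begin(), staging.end(), 0);

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  int *dev_copy = nullptr;
  cudaMalloc(&dev_copy, staging.size() * sizeof(int));

  // kick the copy off first; it proceeds on the stream while the host continues
  cudaMemcpyAsync(dev_copy,
                  staging.data(),
                  staging.size() * sizeof(int),
                  cudaMemcpyHostToDevice,
                  stream);

  // ... host-side bookkeeping (row sizes, batches, window building) would go
  // here, overlapping the transfer instead of waiting behind it ...
  long long checksum = std::accumulate(staging.begin(), staging.end(), 0LL);

  // a kernel launched on the same stream would be ordered after the copy;
  // here we just wait for it explicitly
  cudaStreamSynchronize(stream);
  printf("host work done (checksum %lld), device buffer ready\n", checksum);

  cudaFree(dev_copy);
  cudaStreamDestroy(stream);
  return 0;
}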
+ // Get the pointers to the input columnar data ready + std::vector input_data; + std::vector input_nm; + input_data.reserve(num_columns); + input_nm.reserve(num_columns); + for (size_type column_number = 0; column_number < num_columns; column_number++) { + column_view cv = tbl.column(column_number); + auto const col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (!nested_type) { + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + } + + auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + std::vector row_sizes; // size of each row in bytes including any alignment padding std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column @@ -847,6 +868,9 @@ std::vector> convert_to_rows2(cudf::table_view con fixed_width_size_per_row += col_size; } + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + // When building the columns to return, we have to be mindful of the offset limit in cudf. // It is 32-bit and these data columns are capable of surpassing that easily. The data should // not be cut off exactly at the limit though due to the validity buffers. The most efficient @@ -901,17 +925,18 @@ std::vector> convert_to_rows2(cudf::table_view con // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. + auto validity_size = calculate_validity_size(num_columns); for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned row_sizes[row] = fixed_width_size_per_row; // validity is byte aligned - row_sizes[row] += calculate_validity_size(num_columns); + row_sizes[row] += validity_size; // variable width data is 8-byte aligned row_sizes[row] = detail::align_offset(row_sizes[row], 8) + calculate_variable_width_row_data_size(row); // rows are 8 byte aligned - if (aligned_row_batch_size + row_sizes[row] > std::numeric_limits::max()) { + if ((uint64_t)aligned_row_batch_size + row_sizes[row] > (uint64_t)std::numeric_limits::max()) { // a new batch starts at the last 32-row boundary row_batches.push_back( row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); @@ -932,7 +957,9 @@ std::vector> convert_to_rows2(cudf::table_view con row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows}); } -#if defined(DEBUG) + auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + + #if defined(DEBUG) printf("%d rows and %d columns in table\n", num_rows, num_columns); printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { @@ -942,6 +969,16 @@ std::vector> convert_to_rows2(cudf::table_view con } #endif + std::vector output_buffers; + std::vector output_data; + output_data.reserve(row_batches.size()); + for (uint i = 0; i < row_batches.size(); ++i) { + rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); + output_data.push_back(static_cast(temp.data())); + output_buffers.push_back(std::move(temp)); + } + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + std::vector block_infos; // block infos are organized with the windows going "down" the 
columns @@ -976,8 +1013,13 @@ std::vector> convert_to_rows2(cudf::table_view con } }; + // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write would be memory cache line sized + // access, but since other blocks will read/write the edges this may not turn out to be overly important. + // For now, we will attempt to build a square window as far as byte sizes. x * y = shared_mem_size. + // Which translates to x^2 = shared_mem_size since we want them equal, so height and width are + // sqrt(shared_mem_size). The trick is that it's in bytes, not rows or columns. int const window_height = - std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); + std::min(std::min(size_type(sqrt(shmem_limit_per_block))/column_sizes[0], num_rows), row_batches[0].row_count); #if defined(DEBUG) printf( "max_window_height is %d, num_rows is %d, batch row count is %d - which makes window height " @@ -998,20 +1040,21 @@ std::vector> convert_to_rows2(cudf::table_view con std::size_t alignment_needed = col_size; // They are the same for fixed width types auto row_size_aligned = detail::align_offset(row_size, alignment_needed); auto row_size_with_this_col = row_size_aligned + col_size; + auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - if (row_size_with_this_col * window_height > shmem_limit_per_block) { + if (row_size_with_end_pad * window_height > shmem_limit_per_block) { #if defined(DEBUG) printf( "Window size %d too large at column %d, bumping back to build windows of size %d(cols " "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " "for shared mem size %d\n", - row_size_with_this_col * window_height, + row_size_with_end_pad * window_height, col, row_size * window_height, current_window_start_col, col - 1, window_height, - row_size_with_this_col, + row_size_with_end_pad, row_size, row_size_aligned, shmem_limit_per_block); @@ -1055,20 +1098,6 @@ std::vector> convert_to_rows2(cudf::table_view con build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); } - // Get the pointers to the input columnar data ready - possibly moved up to copy to gpu while - // calculating other things - std::vector input_data; - std::vector input_nm; - for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); - auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (!nested_type) { - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - } #if defined(DEBUG) printf("%lu windows for %d columns, %d rows to fit in ", @@ -1083,26 +1112,11 @@ std::vector> convert_to_rows2(cudf::table_view con printf(" total):\n"); #endif - auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); - auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); - - std::vector output_buffers; - std::vector output_data; - output_data.reserve(row_batches.size()); - for (uint i = 0; i < row_batches.size(); ++i) { - rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); - 
output_data.push_back(static_cast(temp.data())); - output_buffers.push_back(std::move(temp)); - } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); // blast through the entire table and convert it dim3 blocks(block_infos.size()); - dim3 threads(std::min((uint64_t)1024, total_table_size / 8)); + dim3 threads(std::min(1024, shmem_limit_per_block / 8)); #if defined(DEBUG) printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); pretty_print(shmem_limit_per_block); From 5cf1cf1afccacd0f7c9b0d47596176926b8b0858 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 8 Jul 2021 01:52:36 +0000 Subject: [PATCH 12/80] Adding row to column conversion code. Performance falls off a cliff, but starts out reasonably. I haven't looked at this in nsight yet. --- .../row_conversion/row_conversion.cpp | 74 +- cpp/include/cudf/row_conversion.hpp | 12 + cpp/src/row_conversion/row_conversion.cu | 759 +++++++++++++----- cpp/tests/row_conversion/row_conversion.cpp | 106 +++ 4 files changed, 748 insertions(+), 203 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index d6b195433cf..7c1f52c5cd6 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -91,7 +91,7 @@ static void BM_new_to_row(benchmark::State& state) state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); } -/*static void BM_from_row(benchmark::State& state) +static void BM_old_from_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const table = create_random_table({cudf::type_id::INT8, @@ -123,36 +123,62 @@ static void BM_new_to_row(benchmark::State& state) } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -}*/ - -#define OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); +} + +static void BM_new_from_row(benchmark::State& state) +{ + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + auto const table = create_random_table({cudf::type_id::INT8, + cudf::type_id::INT32, + cudf::type_id::INT16, + cudf::type_id::INT64, + cudf::type_id::INT32, + cudf::type_id::BOOL8, + cudf::type_id::UINT16, + cudf::type_id::UINT8, + cudf::type_id::UINT64}, + 256, + row_count{n_rows}); + + std::vector schema; + cudf::size_type total_bytes = 0; + for (int i = 0; i < table->num_columns(); ++i) { + auto t = table->get_column(i).type(); + schema.push_back(t); + total_bytes += cudf::size_of(t); + } + + auto rows = cudf::convert_to_rows(table->view()); + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + + auto out = cudf::convert_from_rows2(rows, schema); + } + + state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); +} -#define NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ +#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, 
name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) -NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) -#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ +#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { BM_from_row(st); } \ + (::benchmark::State & st) { f(st); } \ BENCHMARK_REGISTER_F(RowConversion, name) \ ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 22}}) \ + ->Ranges({{1 << 6, 1 << 20}}) \ ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -//FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) +FROM_ROW_CONVERSION_BENCHMARK_DEFINE(old_from_row_conversion, BM_old_from_row) +FROM_ROW_CONVERSION_BENCHMARK_DEFINE(new_from_row_conversion, BM_new_from_row) diff --git a/cpp/include/cudf/row_conversion.hpp b/cpp/include/cudf/row_conversion.hpp index f5e2225ad19..282ffa4b0cb 100644 --- a/cpp/include/cudf/row_conversion.hpp +++ b/cpp/include/cudf/row_conversion.hpp @@ -48,4 +48,16 @@ std::unique_ptr convert_from_rows( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); +std::unique_ptr convert_from_rows2( + cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr convert_from_rows2( + std::vector> const &input, + std::vector const &schema, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + } // namespace cudf diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 3f221e2f716..c0e78a03576 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -30,6 +30,7 @@ #include #include +#include #include "cudf/types.hpp" #include "rmm/device_buffer.hpp" #include "thrust/iterator/counting_iterator.h" @@ -332,6 +333,20 @@ struct block_info { int buffer_num; }; +// When building the columns to return, we have to be mindful of the offset limit in cudf. +// It is 32-bit and these data columns are capable of surpassing that easily. The data should +// not be cut off exactly at the limit though due to the validity buffers. The most efficient +// place to cut the validity is on a 32-row boundary, so as we calculate the row sizes +// we keep track of the cut points for the validity, which we call row batches. If the row +// is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we +// hit. Note that this boundary is for our book-keeping with column pointers and not anything that +// the kernel needs to worry about. We cut the output at convienient boundaries when assembling +// the outgoing data stream. 
+struct row_batch { + size_type num_bytes; + size_type row_count; +}; + /** * @brief copy data from cudf columns into x format, which is row-based * @@ -364,7 +379,7 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. - constexpr bool debug_print = false; //blockIdx.x == 2649 && threadIdx.x == 479; + bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -383,7 +398,7 @@ __global__ void copy_from_columns(const size_type num_rows, }*/ printf("output data to %p\n", output_data[block_infos[blockIdx.x].buffer_num]); } - //else { return; } + // else { return; } auto block = block_infos[blockIdx.x]; auto const rows_in_block = block.end_row - block.start_row + 1; extern __shared__ int8_t shared_data[]; @@ -403,7 +418,7 @@ __global__ void copy_from_columns(const size_type num_rows, block.buffer_num); } // each thread is responsible for every threadcount rows of data. - // the data is copies into shared memory in the final layout. + // the data is copied into shared memory in the final layout. auto const real_bytes_in_row = col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col]; auto const shmem_row_size = align_offset(real_bytes_in_row + dest_shim_offset, @@ -432,7 +447,7 @@ __global__ void copy_from_columns(const size_type num_rows, auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; if (debug_print) { printf("dest col offset %d\n", dest_col_offset); } - for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += gridDim.x) { + for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { if (debug_print) { printf("shmem row %d(%d) at offset %d(%d)\n", row - block.start_row, @@ -524,8 +539,8 @@ __global__ void copy_from_columns(const size_type num_rows, // row in shared memory may not be an entire row of the destination. 
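The stride changes in this hunk and the next (the per-row loop and the final-write loop now step by blockDim.x rather than gridDim.x) both move to a block-stride loop, where each thread of a block covers every blockDim.x-th element of that block's own slice. A tiny standalone sketch of the pattern, with illustrative names:

__global__ void scale_block_stride(float *data, int n, float factor)
{
  // each block owns a contiguous slice of n elements
  int const base = blockIdx.x * n;
  // consecutive threads touch consecutive elements, so accesses stay coalesced;
  // striding by gridDim.x here would instead skip by the number of blocks,
  // which is the grid-stride pattern and visits different elements than intended
  for (int i = threadIdx.x; i < n; i += blockDim.x) {
    data[base + i] *= factor;
  }
}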
// auto const thread_start_offset = threadIdx.x * 8; - auto const thread_stride = gridDim.x * 8; - auto const end_offset = shmem_row_size * rows_in_block; + auto const thread_stride = blockDim.x * 8; + auto const end_offset = shmem_row_size * rows_in_block; if (debug_print) { printf("writing final data from %d to %d at stride %d\n", @@ -559,9 +574,10 @@ __global__ void copy_from_columns(const size_type num_rows, auto const num_single_bytes = real_bytes_in_row - dest_shim_offset; for (auto i = 0; i < num_single_bytes; ++i) { if (debug_print) { - printf("case 3 - %d single byte final write %p -> %p\n", + printf("case 3 - %d single byte final write %p(%d) -> %p\n", num_single_bytes, &input_ptr[i + dest_shim_offset], + input_ptr[i + dest_shim_offset], &output_ptr[i]); } output_ptr[i] = input_ptr[i + dest_shim_offset]; @@ -600,6 +616,237 @@ __global__ void copy_from_columns(const size_type num_rows, } } +/** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param offsets + * @param output_data + * @param output_nm + * @param col_sizes array of sizes for each element in a column - one per column + * @param col_offsets offset into input data row for each column's start + * @param block_infos information about the blocks of work + * @param input_data pointer to input data + * + */ +__global__ void copy_to_columns(const size_type num_rows, + const size_type num_columns, + const size_type *offsets, + int8_t **output_data, + cudf::bitmask_type **output_nm, + const size_type *col_sizes, + const size_type *col_offsets, + const block_info *block_infos, + const int8_t *input_data) +{ + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // This has been broken up for us in the block_info struct, so we don't have + // any calculation to do here, but it is important to note. + + bool debug_print = false; //blockIdx.x == 1 && threadIdx.x == 0; + + if (debug_print) { + printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); + printf("Column Info:\n"); + for (int i = 0; i < num_columns; ++i) { + printf("col %d is at %p with size %d and offset %d\n", + i, + output_data[i], + col_sizes[i], + col_offsets[i]); + } + printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); + /* printf("Row Offsets:\n"); + for (int i=0; i(&input_data[offsets[absolute_row] + offset_in_row]); + if (debug_print) { + printf("which will be address %p\n", long_col_input); + printf("%p <- long %lu\n", shmem_dest, *long_col_input); } + *reinterpret_cast(shmem_dest) = *long_col_input; + } + + __syncthreads(); + + // now we copy from shared memory to final destination. + // the data is laid out in rows in shared memory, so the reads + // for a column will be "vertical". Because of this and the different + // sizes for each column, this portion is handled on row/column basis. + // to prevent each thread working on a single row and also to ensure + // that all threads can do work in the case of more threads than rows, + // we do a global index instead of a double for loop with col/row. 
+ for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { + auto const relative_col = index % cols_in_block; + auto const relative_row = index / cols_in_block; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + + auto const shared_memory_row_offset = window_quad_width * 8 * relative_row; + auto const shared_memory_offset = col_offsets[absolute_col] - col_offsets[block.start_col] + + shared_memory_row_offset + shared_memory_starting_pad; + auto const column_size = col_sizes[absolute_col]; + + int8_t *shmem_src = &shared_data[shared_memory_offset]; + int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; + + if (debug_print) { + printf("relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, shared_mmeory_row_offset: %d, shared_memory_offset: %d," + " column_size: %d, shmem_src: %p, dst: %p\n", relative_col, relative_row, absolute_col, absolute_row, shared_memory_row_offset, shared_memory_offset, column_size, + shmem_src, dst) ; + } + switch (column_size) { + case 1: { + if (debug_print) { printf("%p <- byte %d\n", dst, *shmem_src); } + *dst = *shmem_src; + break; + } + case 2: { + const int16_t *short_col_input = reinterpret_cast(shmem_src); + if (debug_print) { printf("%p <- short %d\n", dst, *short_col_input); } + *reinterpret_cast(dst) = *short_col_input; + break; + } + case 4: { + const int32_t *int_col_input = reinterpret_cast(shmem_src); + if (debug_print) { printf("%p <- int 0x%x\n", dst, *int_col_input); } + *reinterpret_cast(dst) = *int_col_input; + break; + } + case 8: { + const int64_t *long_col_input = reinterpret_cast(shmem_src); + if (debug_print) { printf("%p <- long %lu\n", dst, *long_col_input); } + *reinterpret_cast(dst) = *long_col_input; + break; + } + default: { + if (debug_print) { + printf("byte for byte copy due to size %d of column %d\n", column_size, absolute_col); + } + // TODO this should just not be supported for fixed width columns, but just in case... + for (cudf::size_type b = 0; b < column_size; b++) { dst[b] = shmem_src[b]; } + break; + } + } + } + + __syncthreads(); + + // now handle validity. Each thread is responsible for 32 rows in a single column. + // to prevent indexing issues with a large number of threads, this is compressed + // to a single loop like above. TODO: investigate using shared memory here + auto const validity_batches_per_col = (num_rows + 31) / 32; + auto const validity_batches_total = validity_batches_per_col * num_columns; + if (debug_print) { + printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n", validity_batches_per_col, validity_batches_total, num_rows); + } + for (int index = threadIdx.x; index < validity_batches_total; index += blockDim.x) { + // what column is this? 
+ auto const col = index / validity_batches_per_col; + auto const batch = index % validity_batches_per_col; + auto const starting_row = batch * 32; + auto const validity_offset = col_offsets[num_columns] + col / 8; + + if (debug_print) { + printf("col: %d, batch: %d, starting_row: %d, validity_offset: %d\n", col, batch, starting_row, validity_offset); + } + + int32_t dst_validity = 0; + for (int row = starting_row; row < std::min(num_rows, starting_row + 32); ++row) { + int8_t const * const validity_ptr = &input_data[offsets[row] + validity_offset]; + + if (debug_print) { + printf("validity_ptr is %p for row %d\nwhich is input_data[%d]\n", validity_ptr, row, offsets[row] + validity_offset); + } + + auto const val_byte = *validity_ptr; + auto const src_shift = col % 8; + auto const dst_shift = row % 32; + auto const src_bit_mask = 1 << src_shift; + if (debug_print) { + printf("src bit mask is 0x%x\n", src_bit_mask); + printf("src shift is 0x%x and dst shift is 0x%x\n", src_shift, dst_shift); + printf("validity bit is 0x%x\n", (val_byte & src_bit_mask) >> src_shift); + } +// auto const dst_bit_mask = 1 << dst_shift; + dst_validity |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); + if (debug_print) { + printf("validity is now 0x%x\n", dst_validity); + } + } + + + int32_t *validity_ptr = reinterpret_cast(output_nm[col] + (starting_row / 32)); + if (debug_print) { + printf("valiidty_ptr is output_nm[%d]: %p + starting_row / 8: %d because starting row is %d, which becomes %p\n", col, output_nm[col], starting_row / 32, starting_row, output_nm[col] + (starting_row / 32)); + printf("validity to write is %d\n", dst_validity); + printf("validity write %p <- %d\n", validity_ptr, dst_validity); + } + *validity_ptr = dst_validity; + } +} + /** * Calculate the dimensions of the kernel for fixed width only columns. * @param [in] num_columns the number of columns being copied. @@ -764,21 +1011,165 @@ static inline int32_t compute_fixed_width_layout(std::vector co return align_offset(at_offset, 8); // 8 bytes (64 bits) } -} // namespace detail +template +static size_type compute_column_information( + iterator begin, + iterator end, + std::vector &column_starts, + std::vector &column_sizes)//, + //std::function nested_type_cb) +{ + size_type fixed_width_size_per_row = 0; + for (auto cv = begin; cv != end; ++cv) { + auto col_type = std::get<0>(*cv); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + +// if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 
8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + } + + auto validity_offset = detail::align_offset(fixed_width_size_per_row, 4); + column_starts.push_back(validity_offset); + + return fixed_width_size_per_row; +} //#define DEBUG -std::vector> convert_to_rows2(cudf::table_view const &tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + +static std::vector build_block_infos(std::vector const &column_sizes, + std::vector const &column_starts, + std::vector const &row_batches, + size_type const total_number_of_rows, + size_type const &shmem_limit_per_block) { - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the - // data, but small enough that multiple columns fit in memory so the writes can coalese as well. - // Potential optimization for window sizes. - const size_type num_columns = tbl.num_columns(); - const size_type num_rows = tbl.num_rows(); + std::vector block_infos; + + // block infos are organized with the windows going "down" the columns + // this provides the most coalescing of memory access + int current_window_width = 0; + int current_window_start_col = 0; + + // build the blocks for a specific set of columns + auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( + int const start_col, int const end_col, int const desired_window_height) { + int current_window_start_row = 0; + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; + while (i < total_number_of_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(desired_window_height, rows_left_in_batch); + + block_infos.emplace_back(detail::block_info{ + start_col, + current_window_start_row, + end_col, + std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), + current_window_row_batch}); + + i += window_height; + current_window_start_row += window_height; + rows_left_in_batch -= window_height; + } + }; + + // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write + // would be memory cache line sized access, but since other blocks will read/write the edges this + // may not turn out to be overly important. For now, we will attempt to build a square window as + // far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we + // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in + // bytes, not rows or columns. 
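To make the square-window heuristic above concrete: assuming, purely for illustration, 48 KiB of shared memory per block and an 8-byte leading column, sqrt(49152) is about 221, so the window starts out roughly 221 bytes on a side, i.e. 221 / 8 = 27 rows tall for that column width. The min() on the next line then clamps that height by the total row count and the first batch's row count, and the column-marching loop that follows closes the current window and starts a new one whenever the padded row slice times the window height would exceed the shared-memory limit.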
+ int const window_height = std::min( + std::min(size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], total_number_of_rows), + row_batches[0].row_count); +#if defined(DEBUG) + printf( + "sqrt(shmem_limit_per_block) / column_sizes[0] is %d and num_rows is %d, batch row count is %d - which makes window height " + "%d\n", + size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], + total_number_of_rows, + row_batches[0].row_count, + window_height); +#endif + + int row_size = 0; + + // march each column and build the blocks of appropriate sizes + for (unsigned int col = 0; col < column_sizes.size(); ++col) { + auto const col_size = column_sizes[col]; + + // align size for this type + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_aligned = detail::align_offset(row_size, alignment_needed); + auto row_size_with_this_col = row_size_aligned + col_size; + auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); + + if (row_size_with_end_pad * window_height > shmem_limit_per_block) { +#if defined(DEBUG) + printf( + "Window size %d too large at column %d, bumping back to build windows of size %d(cols " + "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " + "for shared mem size %d\n", + row_size_with_end_pad * window_height, + col, + row_size * window_height, + current_window_start_col, + col - 1, + window_height, + row_size_with_end_pad, + row_size, + row_size_aligned, + shmem_limit_per_block); +#endif + // too large, close this window, generate vertical blocks and restart + build_blocks(current_window_start_col, col - 1, window_height); + row_size = + detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); +#if defined(DEBUG) + printf( + "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " + "or %d)\n", + row_size, + col_size, + row_size + col_size, + column_starts[col - 1], + column_sizes[col - 1], + column_starts[col - 1] + column_sizes[col - 1]); +#endif + row_size += col_size; // alignment required for shared memory window boundary to match + // alignment of output row + current_window_start_col = col; + current_window_width = 0; + } else { + row_size = row_size_with_this_col; + current_window_width++; + } + } + + // build last set of blocks + if (current_window_width > 0) { + build_blocks(current_window_start_col, (int)column_sizes.size()-1, window_height); + } + + return block_infos; +} +} // namespace detail #if defined(DEBUG) - auto pretty_print = [](uint64_t i) { + void pretty_print(uint64_t i) { if (i > (1 * 1024 * 1024 * 1024)) { printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); } else if (i > (1 * 1024 * 1024)) { @@ -788,9 +1179,19 @@ std::vector> convert_to_rows2(cudf::table_view con } else { printf("%lu Bytes", i); } - }; + } #endif +std::vector> convert_to_rows2(cudf::table_view const &tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the + // data, but small enough that multiple columns fit in memory so the writes can coalese as well. + // Potential optimization for window sizes. 
+ const size_type num_columns = tbl.num_columns(); + const size_type num_rows = tbl.num_rows(); + int device_id; CUDA_TRY(cudaGetDevice(&device_id)); int shmem_limit_per_block; @@ -834,8 +1235,8 @@ std::vector> convert_to_rows2(cudf::table_view con } } - auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); std::vector row_sizes; // size of each row in bytes including any alignment padding std::vector row_offsets; // offset from the start of the data to this row @@ -848,43 +1249,48 @@ std::vector> convert_to_rows2(cudf::table_view con column_sizes.reserve(num_columns); column_starts.reserve(num_columns + 1); // we add a final offset for validity data start - size_type fixed_width_size_per_row = 0; - for (int col = 0; col < num_columns; ++col) { - auto cv = tbl.column(col); - auto col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&tbl](auto i) -> std::tuple { + return std::make_tuple(tbl.column(i).type(), tbl.column(i)); + }); + + size_type fixed_width_size_per_row = detail::compute_column_information( + iter, + iter + num_columns, + column_starts, + column_sizes);//, +// [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); + /* size_type fixed_width_size_per_row = 0; + for (int col = 0; col < num_columns; ++col) { + auto cv = tbl.column(col); + auto col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (nested_type) { variable_width_columns.push_back(cv); } + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + }*/ - if (nested_type) { variable_width_columns.push_back(cv); } +#if defined(DEBUG) + printf("validity offset will be %d + %d = %d\n", + column_starts.back(), + column_sizes.back(), + column_starts.back() + column_sizes.back()); +#endif - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 8 : size_of(col_type); - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - } + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); - - // When building the columns to return, we have to be mindful of the offset limit in cudf. 
- // It is 32-bit and these data columns are capable of surpassing that easily. The data should - // not be cut off exactly at the limit though due to the validity buffers. The most efficient - // place to cut the validity is on a 32-row boundary, so as we calculate the row sizes - // we keep track of the cut points for the validity, which we call row batches. If the row - // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we - // hit. Note that this boundary is for our book-keeping with column pointers and not anything that - // the kernel needs to worry about. We cut the output at convienient boundaries when assembling - // the outgoing data stream. - struct row_batch { - size_type num_bytes; - size_type row_count; - }; - std::vector row_batches; + std::vector row_batches; auto calculate_variable_width_row_data_size = [](int const row) { // each level of variable-width data will add an offset/length @@ -936,10 +1342,11 @@ std::vector> convert_to_rows2(cudf::table_view con row_sizes[row] = detail::align_offset(row_sizes[row], 8) + calculate_variable_width_row_data_size(row); // rows are 8 byte aligned - if ((uint64_t)aligned_row_batch_size + row_sizes[row] > (uint64_t)std::numeric_limits::max()) { + if ((uint64_t)aligned_row_batch_size + row_sizes[row] > + (uint64_t)std::numeric_limits::max()) { // a new batch starts at the last 32-row boundary row_batches.push_back( - row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); row_batch_size = 0; row_batch_rows = row_batch_rows & 31; row_offset = 0; @@ -954,12 +1361,12 @@ std::vector> convert_to_rows2(cudf::table_view con row_batch_rows++; } if (row_batch_size > 0) { - row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows}); + row_batches.push_back(detail::row_batch{static_cast(row_batch_size), row_batch_rows}); } auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); - #if defined(DEBUG) +#if defined(DEBUG) printf("%d rows and %d columns in table\n", num_rows, num_columns); printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { @@ -979,125 +1386,8 @@ std::vector> convert_to_rows2(cudf::table_view con } auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); - std::vector block_infos; - - // block infos are organized with the windows going "down" the columns - // this provides the most coalescing of memory access - int current_window_width = 0; - int current_window_start_col = 0; - - // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, num_rows]( - int const start_col, int const end_col, int const desired_window_height) { - int current_window_start_row = 0; - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; - while (i < num_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(desired_window_height, rows_left_in_batch); - - block_infos.emplace_back( - detail::block_info{start_col, - current_window_start_row, - end_col, - std::min(current_window_start_row + window_height - 1, num_rows - 1), - current_window_row_batch}); - - i += window_height; - current_window_start_row += window_height; - rows_left_in_batch -= window_height; - } - }; - - // the ideal window height has 
lots of 8-byte reads and 8-byte writes. The optimal read/write would be memory cache line sized - // access, but since other blocks will read/write the edges this may not turn out to be overly important. - // For now, we will attempt to build a square window as far as byte sizes. x * y = shared_mem_size. - // Which translates to x^2 = shared_mem_size since we want them equal, so height and width are - // sqrt(shared_mem_size). The trick is that it's in bytes, not rows or columns. - int const window_height = - std::min(std::min(size_type(sqrt(shmem_limit_per_block))/column_sizes[0], num_rows), row_batches[0].row_count); -#if defined(DEBUG) - printf( - "max_window_height is %d, num_rows is %d, batch row count is %d - which makes window height " - "%d\n", - max_window_height, - num_rows, - row_batches[0].row_count, - window_height); -#endif - - int row_size = 0; - - // march each column and build the blocks of appropriate sizes - for (int col = 0; col < num_columns; ++col) { - auto const col_size = column_sizes[col]; - - // align size for this type - std::size_t alignment_needed = col_size; // They are the same for fixed width types - auto row_size_aligned = detail::align_offset(row_size, alignment_needed); - auto row_size_with_this_col = row_size_aligned + col_size; - auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - - if (row_size_with_end_pad * window_height > shmem_limit_per_block) { -#if defined(DEBUG) - printf( - "Window size %d too large at column %d, bumping back to build windows of size %d(cols " - "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " - "for shared mem size %d\n", - row_size_with_end_pad * window_height, - col, - row_size * window_height, - current_window_start_col, - col - 1, - window_height, - row_size_with_end_pad, - row_size, - row_size_aligned, - shmem_limit_per_block); -#endif - // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col - 1, window_height); - row_size = - detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); -#if defined(DEBUG) - printf( - "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " - "or %d)\n", - row_size, - col_size, - row_size + col_size, - column_starts[col - 1], - column_sizes[col - 1], - column_starts[col - 1] + column_sizes[col - 1]); -#endif - row_size += col_size; // alignment required for shared memory window boundary to match - // alignment of output row - current_window_start_col = col; - current_window_width = 0; - } else { - row_size = row_size_with_this_col; - current_window_width++; - } - } - -#if defined(DEBUG) - printf("validity offset will be %d + %d = %d\n", - column_starts.back(), - column_sizes.back(), - column_starts.back() + column_sizes.back()); -#endif - auto validity_offset = detail::align_offset(column_starts.back() + column_sizes.back(), 4); - column_starts.push_back(validity_offset); - - // build last set of blocks - if (current_window_width > 0) { - build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); - } - + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); #if defined(DEBUG) printf("%lu windows for %d columns, %d rows to fit in ", @@ -1116,7 +1406,11 @@ std::vector> convert_to_rows2(cudf::table_view con // blast through the entire table and convert it dim3 blocks(block_infos.size()); - dim3 
threads(std::min(1024, shmem_limit_per_block / 8)); + #if defined(DEBUG) || 1 + dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)total_table_size)); + #else + dim3 threads(std::min(std::min(1024, shmem_limit_per_block / 8), (int)total_table_size)); + #endif #if defined(DEBUG) printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); pretty_print(shmem_limit_per_block); @@ -1206,11 +1500,11 @@ std::vector> convert_to_rows(cudf::table_view cons using ScalarType = cudf::scalar_type_t; auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - zero->set_valid(true, stream); + zero->set_valid_async(true, stream); static_cast(zero.get())->set_value(0, stream); auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - step->set_valid(true, stream); + step->set_valid_async(true, stream); static_cast(step.get()) ->set_value(static_cast(size_per_row), stream); @@ -1238,6 +1532,97 @@ std::vector> convert_to_rows(cudf::table_view cons } } +std::unique_ptr convert_from_rows2(cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + // verify that the types are what we expect + cudf::column_view child = input.child(); + cudf::type_id list_type = child.type().id(); + CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, + "Only a list of bytes is supported as input"); + + cudf::size_type num_columns = schema.size(); + cudf::size_type num_rows = input.parent().size(); + + int device_id; + CUDA_TRY(cudaGetDevice(&device_id)); + int shmem_limit_per_block; + CUDA_TRY( + cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + + std::vector column_starts; + std::vector column_sizes; + + auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { + return std::make_tuple(schema[i], nullptr); + }); + size_type fixed_width_size_per_row = detail::compute_column_information( + iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); + + size_type validity_size = (num_columns + 7) / 8; + + size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); + + // Ideally we would check that the offsets are all the same, etc. 
but for now + // this is probably fine + CUDF_EXPECTS(row_size * num_rows == child.size(), + "The layout of the data appears to be off"); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + + // build the row_batches from the passed in list column + std::vector row_batches; + + row_batches.push_back(detail::row_batch{child.size(), num_rows}); + + // Allocate the columns we are going to write into + std::vector> output_columns; + std::vector output_data; + std::vector output_nm; + for (cudf::size_type i = 0; i < num_columns; i++) { + auto column = cudf::make_fixed_width_column( + schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); + auto mut = column->mutable_view(); + output_data.emplace_back(mut.data()); + output_nm.emplace_back(mut.null_mask()); + output_columns.emplace_back(std::move(column)); + } + + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + auto dev_output_nm = detail::copy_to_dev_async2(output_nm, stream, mr); + + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); + + auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + + dim3 blocks(block_infos.size()); + #if defined(DEBUG) || 1 + dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)child.size())); + #else + dim3 threads(std::min(std::min(1024, shmem_limit_per_block / 8), (int)child.size())); + #endif +#if defined(DEBUG) + printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); + pretty_print(shmem_limit_per_block); + printf(" shared memory\n"); +#endif + detail::copy_to_columns<<>>( + num_rows, + num_columns, + input.offsets().data(), + dev_output_data.data(), + dev_output_nm.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + dev_block_infos.data(), + child.data()); + + return std::make_unique(std::move(output_columns)); +} + std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, std::vector const &schema, rmm::cuda_stream_view stream, @@ -1318,4 +1703,20 @@ std::unique_ptr convert_from_rows( // } } +std::unique_ptr convert_from_rows2( + std::vector> const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + CUDF_EXPECTS(input.size() == 1, "Too large of an input, need to concat the output tables..."); + + // for (uint i=0; iview(); + auto ret = convert_from_rows2(lcv, schema, stream, mr); + + return ret; + // } +} + } // namespace cudf diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index c02f83ad1d5..818d7a89ddb 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -21,9 +21,13 @@ #include #include +#include "cudf/lists/lists_column_view.hpp" +#include "cudf/types.hpp" struct ColumnToRowTests : public cudf::test::BaseFixture { }; +struct RowToColumnTests : public cudf::test::BaseFixture { +}; TEST_F(ColumnToRowTests, Single) { @@ -108,3 +112,105 @@ TEST_F(ColumnToRowTests, SingleByteWide) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } } + +TEST_F(RowToColumnTests, Single) +{ + cudf::test::fixed_width_column_wrapper a({-1}); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema{cudf::data_type{cudf::type_id::INT32}}; + for (uint i=0; i a({-1, 0, 1}); + cudf::table_view 
in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema{cudf::data_type{cudf::type_id::INT32}}; + for (uint i=0; i int32_t { return rand(); }); + cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema; + schema.reserve(in.num_columns()); + for (auto col = in.begin(); col < in.end(); ++col) { + schema.push_back(col->type()); + } + for (uint i=0; i> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema; + schema.reserve(in.num_columns()); + for (auto col = in.begin(); col < in.end(); ++col) { + schema.push_back(col->type()); + } + + for (uint i=0; i> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema; + schema.reserve(in.num_columns()); + for (auto col = in.begin(); col < in.end(); ++col) { + schema.push_back(col->type()); + } + for (uint i=0; i Date: Thu, 8 Jul 2021 20:45:18 +0000 Subject: [PATCH 13/80] updating to use make_device_uvector_async and bitmask functions per review comments --- cpp/src/row_conversion/row_conversion.cu | 125 +++++++++-------------- 1 file changed, 47 insertions(+), 78 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index c0e78a03576..c73e967cf0f 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -36,6 +37,7 @@ #include "thrust/iterator/counting_iterator.h" #include "thrust/iterator/transform_iterator.h" +using cudf::detail::make_device_uvector_async; namespace cudf { namespace detail { @@ -45,32 +47,6 @@ static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size return (offset + alignment - 1) & ~(alignment - 1); } -/** - * Copy a simple vector to device memory asynchronously. Be sure to read - * the data on the same stream as is used to copy it. 
- */ -template -std::unique_ptr> copy_to_dev_async(const std::vector &input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - std::unique_ptr> ret(new rmm::device_uvector(input.size(), stream, mr)); - CUDA_TRY(cudaMemcpyAsync( - ret->data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); - return ret; -} - -template -rmm::device_uvector copy_to_dev_async2(const std::vector &input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - rmm::device_uvector ret(input.size(), stream, mr); - CUDA_TRY(cudaMemcpyAsync( - ret.data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); - return ret; -} - __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type row_size, @@ -180,8 +156,8 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, } cudf::bitmask_type *nm = output_nm[col_index]; - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; + int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; + cudf::size_type byte_bit_offset = intra_word_index(col_index); int predicate = *valid_byte & (1 << byte_bit_offset); uint32_t bitmask = __ballot_sync(active_mask, predicate); if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } @@ -278,8 +254,8 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, } // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; + int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; + cudf::size_type byte_bit_offset = intra_word_index(col_index); uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -505,8 +481,8 @@ __global__ void copy_from_columns(const size_type num_rows, // we do this directly in the final location because the entire row may not // fit in shared memory and may require many blocks to process it entirely int8_t *valid_byte = - &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; - cudf::size_type byte_bit_offset = col % 8; + &output_data[block.buffer_num][row_offsets[row] + validity_offset + word_index(col)]; + cudf::size_type byte_bit_offset = intra_word_index(col); uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -648,7 +624,7 @@ __global__ void copy_to_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. 
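With the hand-rolled copy_to_dev_async helpers removed above, the call sites now go through cudf's vector factory. A hedged usage sketch (assuming the factory lives in cudf/detail/utilities/vector_factories.hpp, and with stream and mr supplied by the caller as at the converted call sites):

#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

#include <vector>

// sketch only: copy a host vector to the device asynchronously on `stream`.
// The returned device_uvector owns the allocation; the host vector must stay
// alive until the copy has completed (e.g. until the stream is synchronized).
rmm::device_uvector<cudf::size_type> to_device(std::vector<cudf::size_type> const &host,
                                               rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource *mr)
{
  return cudf::detail::make_device_uvector_async(host, stream, mr);
}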
- bool debug_print = false; //blockIdx.x == 1 && threadIdx.x == 0; + bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -806,7 +782,7 @@ __global__ void copy_to_columns(const size_type num_rows, auto const col = index / validity_batches_per_col; auto const batch = index % validity_batches_per_col; auto const starting_row = batch * 32; - auto const validity_offset = col_offsets[num_columns] + col / 8; + auto const validity_offset = col_offsets[num_columns] + word_index(col); if (debug_print) { printf("col: %d, batch: %d, starting_row: %d, validity_offset: %d\n", col, batch, starting_row, validity_offset); @@ -821,7 +797,7 @@ __global__ void copy_to_columns(const size_type num_rows, } auto const val_byte = *validity_ptr; - auto const src_shift = col % 8; + auto const src_shift = intra_word_index(col); auto const dst_shift = row % 32; auto const src_bit_mask = 1 << src_shift; if (debug_print) { @@ -920,10 +896,10 @@ static std::unique_ptr fixed_width_convert_to_rows( const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type size_per_row, - std::unique_ptr> &column_start, - std::unique_ptr> &column_size, - std::unique_ptr> &input_data, - std::unique_ptr> &input_nm, + rmm::device_uvector &column_start, + rmm::device_uvector &column_size, + rmm::device_uvector &input_data, + rmm::device_uvector &input_nm, const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, rmm::cuda_stream_view stream, @@ -954,10 +930,10 @@ static std::unique_ptr fixed_width_convert_to_rows( num_rows, num_columns, size_per_row, - column_start->data(), - column_size->data(), - input_data->data(), - input_nm->data(), + column_start.data(), + column_size.data(), + input_data.data(), + input_nm.data(), data->mutable_view().data()); return cudf::make_lists_column(num_rows, @@ -1004,7 +980,7 @@ static inline int32_t compute_fixed_width_layout(std::vector co // Now we need to add in space for validity // Eventually we can think about nullable vs not nullable, but for now we will just always add it // in - int32_t validity_bytes_needed = (schema.size() + 7) / 8; + int32_t validity_bytes_needed = word_index(schema.size() + 7); // validity comes at the end and is byte aligned so we can pack more in. 
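One distinction worth keeping in mind around these helpers: the packed row format stores validity byte aligned (column c maps to byte c / 8, bit c % 8 within the row), while cudf's bitmask utilities index 32-bit bitmask words. A small illustration, assuming the constexpr helpers in cudf/utilities/bit.hpp and a hypothetical column index of 13:

#include <cudf/utilities/bit.hpp>

// byte-aligned row validity: column 13 lives in byte 1, bit 5 of the row's validity area
static_assert(13 / 8 == 1 && 13 % 8 == 5, "row validity is byte aligned");
// cudf bitmask utilities operate on 32-bit words: word 0, bit 13
static_assert(cudf::word_index(13) == 0 && cudf::intra_word_index(13) == 13,
              "bitmask utilities index 32-bit words");

The two indexings only agree for the first byte's worth of columns, which is why the byte arithmetic matters for the row layout even though the column-side null masks use 32-bit bitmask_type words.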
at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned @@ -1235,8 +1211,8 @@ std::vector> convert_to_rows2(cudf::table_view con } } - auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + auto dev_input_data = make_device_uvector_async(input_data, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); std::vector row_sizes; // size of each row in bytes including any alignment padding std::vector row_offsets; // offset from the start of the data to this row @@ -1287,8 +1263,8 @@ std::vector> convert_to_rows2(cudf::table_view con #endif - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); std::vector row_batches; @@ -1322,16 +1298,9 @@ std::vector> convert_to_rows2(cudf::table_view con size_type row_batch_rows = 0; uint64_t row_offset = 0; - auto calculate_validity_size = [](int const num_cols) { - // Now we need to add in space for validity - // Eventually we can think about nullable vs not nullable, but for now we will just always add - // it in - return (num_cols + 7) / 8; - }; - // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. - auto validity_size = calculate_validity_size(num_columns); + auto validity_size = num_bitmask_words(num_columns); for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned @@ -1364,7 +1333,7 @@ std::vector> convert_to_rows2(cudf::table_view con row_batches.push_back(detail::row_batch{static_cast(row_batch_size), row_batch_rows}); } - auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); #if defined(DEBUG) printf("%d rows and %d columns in table\n", num_rows, num_columns); @@ -1384,7 +1353,7 @@ std::vector> convert_to_rows2(cudf::table_view con output_data.push_back(static_cast(temp.data())); output_buffers.push_back(std::move(temp)); } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); std::vector block_infos = build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); @@ -1402,7 +1371,7 @@ std::vector> convert_to_rows2(cudf::table_view con printf(" total):\n"); #endif - auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); // blast through the entire table and convert it dim3 blocks(block_infos.size()); @@ -1443,7 +1412,7 @@ std::vector> convert_to_rows2(cudf::table_view con } offset_offset += row_batches[i].row_count; - auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); + auto dev_offsets = make_device_uvector_async(offset_vals, stream, mr); auto offsets = std::make_unique( data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); @@ -1477,8 +1446,8 @@ std::vector> convert_to_rows(cudf::table_view cons std::vector column_size; int32_t size_per_row = 
detail::compute_fixed_width_layout(schema, column_start, column_size); - auto dev_column_start = detail::copy_to_dev_async(column_start, stream, mr); - auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); + auto dev_column_start = make_device_uvector_async(column_start, stream, mr); + auto dev_column_size = make_device_uvector_async(column_size, stream, mr); int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; // Make the number of rows per batch a multiple of 32 so we don't have to worry about @@ -1495,8 +1464,8 @@ std::vector> convert_to_rows(cudf::table_view cons input_data.emplace_back(cv.data()); input_nm.emplace_back(cv.null_mask()); } - auto dev_input_data = detail::copy_to_dev_async(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async(input_nm, stream, mr); + auto dev_input_data = make_device_uvector_async(input_data, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); using ScalarType = cudf::scalar_type_t; auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); @@ -1561,7 +1530,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i size_type fixed_width_size_per_row = detail::compute_column_information( iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); - size_type validity_size = (num_columns + 7) / 8; + size_type validity_size = num_bitmask_words(num_columns); size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); @@ -1569,8 +1538,8 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i // this is probably fine CUDF_EXPECTS(row_size * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); // build the row_batches from the passed in list column std::vector row_batches; @@ -1590,13 +1559,13 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i output_columns.emplace_back(std::move(column)); } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); - auto dev_output_nm = detail::copy_to_dev_async2(output_nm, stream, mr); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); std::vector block_infos = build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); dim3 blocks(block_infos.size()); #if defined(DEBUG) || 1 @@ -1647,8 +1616,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in // this is probably fine CUDF_EXPECTS(size_per_row * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_column_start = detail::copy_to_dev_async(column_start, stream, mr); - auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); + auto dev_column_start = make_device_uvector_async(column_start, stream); + auto dev_column_size = make_device_uvector_async(column_size, stream); // Allocate the columns we are going to write into std::vector> output_columns; @@ -1663,8 +1632,8 @@ 
std::unique_ptr convert_from_rows(cudf::lists_column_view const &in output_columns.emplace_back(std::move(column)); } - auto dev_output_data = detail::copy_to_dev_async(output_data, stream, mr); - auto dev_output_nm = detail::copy_to_dev_async(output_nm, stream, mr); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); dim3 blocks; dim3 threads; @@ -1675,10 +1644,10 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in num_rows, num_columns, size_per_row, - dev_column_start->data(), - dev_column_size->data(), - dev_output_data->data(), - dev_output_nm->data(), + dev_column_start.data(), + dev_column_size.data(), + dev_output_data.data(), + dev_output_nm.data(), child.data()); return std::make_unique(std::move(output_columns)); From 1d0245bffc592f80ba6b4fce7d9bcf9d585eef30 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 13 Jul 2021 07:18:49 +0000 Subject: [PATCH 14/80] updating conversion code. Found out bit operations are on 32-bit values, so they can't be used since row data has byte-aligned validity. Performance improvements on the row to column side. --- cpp/src/row_conversion/row_conversion.cu | 106 ++++++++++++----------- 1 file changed, 54 insertions(+), 52 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index c73e967cf0f..0879a1c50a5 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -37,6 +37,8 @@ #include "thrust/iterator/counting_iterator.h" #include "thrust/iterator/transform_iterator.h" +#define NUM_BLOCKS_PER_KERNEL_TO_COLUMNS (2) + using cudf::detail::make_device_uvector_async; namespace cudf { @@ -156,11 +158,11 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, } cudf::bitmask_type *nm = output_nm[col_index]; - int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; - cudf::size_type byte_bit_offset = intra_word_index(col_index); + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; int predicate = *valid_byte & (1 << byte_bit_offset); uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } + if (row_index % 32 == 0) { nm[row_index / 8] = bitmask; } } // end column loop } // end row copy // wait for the row_group to be totally copied before starting on the next row group @@ -254,8 +256,8 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, } // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; - cudf::size_type byte_bit_offset = intra_word_index(col_index); + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -481,8 +483,8 @@ __global__ void copy_from_columns(const size_type num_rows, // we do this directly in the final location because the entire row may not // fit in shared memory and may require many blocks to process it entirely int8_t *valid_byte = - &output_data[block.buffer_num][row_offsets[row] + validity_offset + word_index(col)]; - cudf::size_type 
byte_bit_offset = intra_word_index(col); + &output_data[block.buffer_num][row_offsets[row] + validity_offset + (col / 8)]; + cudf::size_type byte_bit_offset = col % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -597,6 +599,7 @@ __global__ void copy_from_columns(const size_type num_rows, * * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block * @param offsets * @param output_data * @param output_nm @@ -608,6 +611,7 @@ __global__ void copy_from_columns(const size_type num_rows, */ __global__ void copy_to_columns(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, const size_type *offsets, int8_t **output_data, cudf::bitmask_type **output_nm, @@ -624,18 +628,10 @@ __global__ void copy_to_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. - bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; + constexpr bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("Column Info:\n"); - for (int i = 0; i < num_columns; ++i) { - printf("col %d is at %p with size %d and offset %d\n", - i, - output_data[i], - col_sizes[i], - col_offsets[i]); - } printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); /* printf("Row Offsets:\n"); for (int i=0; i blockDim.x) { + break; + } + auto block = block_infos[this_block_index]; auto const rows_in_block = block.end_row - block.start_row + 1; auto const cols_in_block = block.end_col - block.start_col + 1; extern __shared__ int8_t shared_data[]; @@ -767,61 +769,58 @@ __global__ void copy_to_columns(const size_type num_rows, } } - __syncthreads(); - - // now handle validity. Each thread is responsible for 32 rows in a single column. + // now handle validity. Each thread is responsible for 32 rows in 8 columns. // to prevent indexing issues with a large number of threads, this is compressed // to a single loop like above. TODO: investigate using shared memory here auto const validity_batches_per_col = (num_rows + 31) / 32; - auto const validity_batches_total = validity_batches_per_col * num_columns; - if (debug_print) { - printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n", validity_batches_per_col, validity_batches_total, num_rows); + auto const validity_batches_total = std::max(1, validity_batches_per_col * (num_columns / 8)); + if (debug_print && threadIdx.x == 0 && blockIdx.x == 0) { + printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n%d blocks of %d threads\n", validity_batches_per_col, validity_batches_total, num_rows, gridDim.x, blockDim.x); } - for (int index = threadIdx.x; index < validity_batches_total; index += blockDim.x) { - // what column is this? 
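// Illustrative sketch, not from the patch itself: the validity loop being reworked here gathers one
// column's bits from up to 32 consecutive rows into the 32-bit word cudf's null mask expects. The
// kernel walks rows through the offsets[] array; this sketch assumes a constant row stride purely to
// stay self-contained, and all names (gather_column_validity, row_size, ...) are illustrative.
#include <cstdint>

__device__ inline uint32_t gather_column_validity(int8_t const* rows,    // first of up to 32 rows
                                                  int row_size,          // bytes per row
                                                  int validity_offset,   // start of validity bytes in a row
                                                  int col,               // column index
                                                  int rows_remaining)
{
  uint32_t word = 0;
  for (int r = 0; r < rows_remaining && r < 32; ++r) {
    auto const byte = static_cast<uint8_t>(rows[r * row_size + validity_offset + col / 8]);
    word |= ((byte >> (col % 8)) & 1u) << r;  // bit r of the word is row r's validity
  }
  return word;
}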
- auto const col = index / validity_batches_per_col; + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < validity_batches_total; index += blockDim.x * gridDim.x) { + auto const start_col = (index * 8) / validity_batches_per_col; auto const batch = index % validity_batches_per_col; auto const starting_row = batch * 32; - auto const validity_offset = col_offsets[num_columns] + word_index(col); + auto const validity_offset = col_offsets[num_columns] + (start_col / 8); if (debug_print) { - printf("col: %d, batch: %d, starting_row: %d, validity_offset: %d\n", col, batch, starting_row, validity_offset); + printf("%d-%d: cols: %d-%d, word index: %d, batch: %d, starting_row: %d, +validity_offset: %d, index: %d, stride: %d\n", threadIdx.x, blockIdx.x, start_col, start_col + 7, (start_col / 8), batch, starting_row, validity_offset, index, blockDim.x * gridDim.x); } - int32_t dst_validity = 0; + // one for each column + int32_t dst_validity[8] = {0}; for (int row = starting_row; row < std::min(num_rows, starting_row + 32); ++row) { int8_t const * const validity_ptr = &input_data[offsets[row] + validity_offset]; if (debug_print) { - printf("validity_ptr is %p for row %d\nwhich is input_data[%d]\n", validity_ptr, row, offsets[row] + validity_offset); + printf("%d: validity_ptr is %p for row %d\n", threadIdx.x, validity_ptr, row); } auto const val_byte = *validity_ptr; - auto const src_shift = intra_word_index(col); - auto const dst_shift = row % 32; - auto const src_bit_mask = 1 << src_shift; - if (debug_print) { - printf("src bit mask is 0x%x\n", src_bit_mask); - printf("src shift is 0x%x and dst shift is 0x%x\n", src_shift, dst_shift); - printf("validity bit is 0x%x\n", (val_byte & src_bit_mask) >> src_shift); - } -// auto const dst_bit_mask = 1 << dst_shift; - dst_validity |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); - if (debug_print) { - printf("validity is now 0x%x\n", dst_validity); + + for (int i=0; i> src_shift); + } + // auto const dst_bit_mask = 1 << dst_shift; + dst_validity[i] |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); } } - int32_t *validity_ptr = reinterpret_cast(output_nm[col] + (starting_row / 32)); - if (debug_print) { - printf("valiidty_ptr is output_nm[%d]: %p + starting_row / 8: %d because starting row is %d, which becomes %p\n", col, output_nm[col], starting_row / 32, starting_row, output_nm[col] + (starting_row / 32)); - printf("validity to write is %d\n", dst_validity); - printf("validity write %p <- %d\n", validity_ptr, dst_validity); + for (int i=0; i(output_nm[start_col + i] + (starting_row / 32)); + if (debug_print) { + printf("%d-%d: validity write output_nm[%d][%d] - %p <- %d\n", threadIdx.x, blockIdx.x, start_col + i, starting_row, validity_ptr, dst_validity[i]); + } + *validity_ptr = dst_validity[i]; } - *validity_ptr = dst_validity; } } +} /** * Calculate the dimensions of the kernel for fixed width only columns. @@ -980,7 +979,7 @@ static inline int32_t compute_fixed_width_layout(std::vector co // Now we need to add in space for validity // Eventually we can think about nullable vs not nullable, but for now we will just always add it // in - int32_t validity_bytes_needed = word_index(schema.size() + 7); + int32_t validity_bytes_needed = (schema.size() + 7) / 8; // validity comes at the end and is byte aligned so we can pack more in. 
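// Illustrative note, not from the patch itself: this revert is the point of patch 14's commit
// message - cudf's bitmask helpers (word_index / intra_word_index) address 32-bit bitmask words,
// while the row format keeps validity byte aligned, so the byte math has to stay as explicit /8
// and %8. A tiny self-contained restatement; the *_32 and validity_* names are illustrative only.
constexpr int word_index_32(int bit)        { return bit / 32; }  // 32-bit bitmask word holding `bit`
constexpr int intra_word_index_32(int bit)  { return bit % 32; }  // bit position inside that word
constexpr int validity_byte_index(int col)  { return col / 8; }   // row-format byte holding `col`
constexpr int validity_bit_in_byte(int col) { return col % 8; }   // bit position inside that byte

static_assert(word_index_32(40) == 1 && validity_byte_index(40) == 5,
              "bit 40 sits in 32-bit word 1, but in byte 5 of a byte-aligned mask");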
at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned @@ -1300,7 +1299,7 @@ std::vector> convert_to_rows2(cudf::table_view con // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. - auto validity_size = num_bitmask_words(num_columns); + auto validity_size = num_bitmask_words(num_columns) * 4; for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned @@ -1521,6 +1520,8 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i CUDA_TRY( cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + shmem_limit_per_block /= NUM_BLOCKS_PER_KERNEL_TO_COLUMNS; + std::vector column_starts; std::vector column_sizes; @@ -1530,7 +1531,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i size_type fixed_width_size_per_row = detail::compute_column_information( iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); - size_type validity_size = num_bitmask_words(num_columns); + size_type validity_size = num_bitmask_words(num_columns) * 4; size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); @@ -1567,7 +1568,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - dim3 blocks(block_infos.size()); + dim3 blocks((block_infos.size() + (NUM_BLOCKS_PER_KERNEL_TO_COLUMNS - 1)) / NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); #if defined(DEBUG) || 1 dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)child.size())); #else @@ -1581,6 +1582,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i detail::copy_to_columns<<>>( num_rows, num_columns, + shmem_limit_per_block, input.offsets().data(), dev_output_data.data(), dev_output_nm.data(), From 65490e027df4aa5e55292731434c679f79d0d58b Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Mon, 13 Sep 2021 19:46:03 +0000 Subject: [PATCH 15/80] updating for memcpy_async and validation in a different kernel --- .../row_conversion/row_conversion.cpp | 47 +- cpp/include/cudf/row_conversion.hpp | 38 +- cpp/src/row_conversion/row_conversion.cu | 1926 ++++++++++++----- cpp/tests/row_conversion/row_conversion.cpp | 132 +- java/src/main/native/src/row_conversion.cu | 1293 ++++++++++- java/src/main/native/src/row_conversion.hpp | 12 + 6 files changed, 2714 insertions(+), 734 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index 7c1f52c5cd6..ad9925e9043 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -20,7 +20,8 @@ #include #include -#include "cudf_test/column_utilities.hpp" +#include +#include class RowConversion : public cudf::benchmark { }; @@ -39,9 +40,6 @@ static void BM_old_to_row(benchmark::State& state) cudf::type_id::UINT64}, 212, row_count{n_rows}); - /* auto const table = create_random_table({cudf::type_id::INT32}, - 64, - row_count{n_rows});*/ cudf::size_type total_bytes = 0; for (int i = 0; i < table->num_columns(); ++i) { @@ -52,7 +50,7 @@ static void BM_old_to_row(benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); - auto rows = cudf::convert_to_rows(table->view()); + auto rows = 
cudf::old_convert_to_rows(table->view()); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); @@ -72,9 +70,6 @@ static void BM_new_to_row(benchmark::State& state) cudf::type_id::UINT64}, 212, row_count{n_rows}); - /* auto const table = create_random_table({cudf::type_id::INT32}, - 64, - row_count{n_rows});*/ cudf::size_type total_bytes = 0; for (int i = 0; i < table->num_columns(); ++i) { @@ -85,7 +80,7 @@ static void BM_new_to_row(benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); - auto new_rows = cudf::convert_to_rows2(table->view()); + auto new_rows = cudf::convert_to_rows(table->view()); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); @@ -114,12 +109,13 @@ static void BM_old_from_row(benchmark::State& state) total_bytes += cudf::size_of(t); } - auto rows = cudf::convert_to_rows(table->view()); + auto rows = cudf::old_convert_to_rows(table->view()); + cudf::lists_column_view const first_list(rows.front()->view()); for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); - auto out = cudf::convert_from_rows(rows, schema); + auto out = cudf::old_convert_from_rows(first_list, schema); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); @@ -148,36 +144,37 @@ static void BM_new_from_row(benchmark::State& state) total_bytes += cudf::size_of(t); } - auto rows = cudf::convert_to_rows(table->view()); + auto rows = cudf::old_convert_to_rows(table->view()); + cudf::lists_column_view const first_list(rows.front()->view()); for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); - auto out = cudf::convert_from_rows2(rows, schema); + auto out = cudf::convert_from_rows(first_list, schema); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); } #define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) #define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); FROM_ROW_CONVERSION_BENCHMARK_DEFINE(old_from_row_conversion, BM_old_from_row) diff --git a/cpp/include/cudf/row_conversion.hpp b/cpp/include/cudf/row_conversion.hpp index 282ffa4b0cb..8f82d01b06c 100644 --- a/cpp/include/cudf/row_conversion.hpp +++ b/cpp/include/cudf/row_conversion.hpp @@ -24,40 +24,28 @@ namespace cudf { -std::vector> convert_to_rows( - cudf::table_view const &tbl, +std::vector> old_convert_to_rows( + cudf::table_view const& tbl, // TODO need 
something for validity rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -std::vector> convert_to_rows2( - cudf::table_view const &tbl, +std::vector> convert_to_rows( + cudf::table_view const& tbl, // TODO need something for validity rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -std::unique_ptr convert_from_rows( - cudf::lists_column_view const &input, - std::vector const &schema, +std::unique_ptr old_convert_from_rows( + cudf::lists_column_view const& input, + std::vector const& schema, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); std::unique_ptr convert_from_rows( - std::vector> const &input, - std::vector const &schema, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr convert_from_rows2( - cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr convert_from_rows2( - std::vector> const &input, - std::vector const &schema, + cudf::lists_column_view const& input, + std::vector const& schema, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace cudf diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 0879a1c50a5..42c40e0542d 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -18,26 +18,42 @@ #include #include #include +#include +#include + +#include + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +#include +#endif #include #include +#include +#include #include +#include #include #include +#include #include #include #include + #include +#include #include -#include -#include -#include "cudf/types.hpp" -#include "rmm/device_buffer.hpp" -#include "thrust/iterator/counting_iterator.h" -#include "thrust/iterator/transform_iterator.h" +#include +#include -#define NUM_BLOCKS_PER_KERNEL_TO_COLUMNS (2) +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +constexpr auto NUM_BLOCKS_PER_KERNEL_TO_COLUMNS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; +constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; +constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; +#endif using cudf::detail::make_device_uvector_async; namespace cudf { @@ -52,11 +68,11 @@ static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type row_size, - const cudf::size_type *input_offset_in_row, - const cudf::size_type *num_bytes, - int8_t **output_data, - cudf::bitmask_type **output_nm, - const int8_t *input_data) + const cudf::size_type* 
input_offset_in_row, + const cudf::size_type* num_bytes, + int8_t** output_data, + cudf::bitmask_type** output_nm, + const int8_t* input_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. @@ -81,15 +97,15 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, // Because we are copying fixed width only data and we stride the rows // this thread will always start copying from shared data in the same place - int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t *row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + int8_t* row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t* row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; row_group_index += row_group_stride) { // Step 1: Copy the data into shared memory // We know row_size is always aligned with and a multiple of int64_t; - int64_t *long_shared = reinterpret_cast(shared_data); - const int64_t *long_input = reinterpret_cast(input_data); + int64_t* long_shared = reinterpret_cast(shared_data); + const int64_t* long_input = reinterpret_cast(input_data); cudf::size_type shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); cudf::size_type shared_output_stride = blockDim.x * blockDim.y; @@ -125,26 +141,26 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, for (cudf::size_type col_index = col_index_start; col_index < num_columns; col_index += col_index_stride) { cudf::size_type col_size = num_bytes[col_index]; - const int8_t *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); - int8_t *col_output = output_data[col_index]; + const int8_t* col_tmp = &(row_tmp[input_offset_in_row[col_index]]); + int8_t* col_output = output_data[col_index]; switch (col_size) { case 1: { col_output[row_index] = *col_tmp; break; } case 2: { - int16_t *short_col_output = reinterpret_cast(col_output); - short_col_output[row_index] = *reinterpret_cast(col_tmp); + int16_t* short_col_output = reinterpret_cast(col_output); + short_col_output[row_index] = *reinterpret_cast(col_tmp); break; } case 4: { - int32_t *int_col_output = reinterpret_cast(col_output); - int_col_output[row_index] = *reinterpret_cast(col_tmp); + int32_t* int_col_output = reinterpret_cast(col_output); + int_col_output[row_index] = *reinterpret_cast(col_tmp); break; } case 8: { - int64_t *long_col_output = reinterpret_cast(col_output); - long_col_output[row_index] = *reinterpret_cast(col_tmp); + int64_t* long_col_output = reinterpret_cast(col_output); + long_col_output[row_index] = *reinterpret_cast(col_tmp); break; } default: { @@ -157,12 +173,12 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, } } - cudf::bitmask_type *nm = output_nm[col_index]; - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::bitmask_type* nm = output_nm[col_index]; + int8_t* valid_byte = &row_vld_tmp[col_index / 8]; cudf::size_type byte_bit_offset = col_index % 8; int predicate = *valid_byte & (1 << byte_bit_offset); uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { nm[row_index / 8] = bitmask; } + if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } } // end column loop } // end row copy // wait for the row_group to be totally copied before starting on the next row group @@ -174,11 +190,11 @@ __global__ void 
copy_from_fixed_width_columns(const cudf::size_type start_row, const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type row_size, - const cudf::size_type *output_offset_in_row, - const cudf::size_type *num_bytes, - const int8_t **input_data, - const cudf::bitmask_type **input_nm, - int8_t *output_data) + const cudf::size_type* output_offset_in_row, + const cudf::size_type* num_bytes, + const int8_t** input_data, + const cudf::bitmask_type** input_nm, + int8_t* output_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. @@ -205,8 +221,8 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, // Because we are copying fixed width only data and we stride the rows // this thread will always start copying to shared data in the same place - int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t *row_vld_tmp = + int8_t* row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t* row_vld_tmp = &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; @@ -223,26 +239,26 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, for (cudf::size_type col_index = col_index_start; col_index < num_columns; col_index += col_index_stride) { cudf::size_type col_size = num_bytes[col_index]; - int8_t *col_tmp = &(row_tmp[output_offset_in_row[col_index]]); - const int8_t *col_input = input_data[col_index]; + int8_t* col_tmp = &(row_tmp[output_offset_in_row[col_index]]); + const int8_t* col_input = input_data[col_index]; switch (col_size) { case 1: { *col_tmp = col_input[row_index]; break; } case 2: { - const int16_t *short_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = short_col_input[row_index]; + const int16_t* short_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = short_col_input[row_index]; break; } case 4: { - const int32_t *int_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = int_col_input[row_index]; + const int32_t* int_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = int_col_input[row_index]; break; } case 8: { - const int64_t *long_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = long_col_input[row_index]; + const int64_t* long_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = long_col_input[row_index]; break; } default: { @@ -256,10 +272,10 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, } // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + int8_t* valid_byte = &row_vld_tmp[col_index / 8]; cudf::size_type byte_bit_offset = col_index % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; - int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); + int32_t* valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); // Now copy validity for the column if (input_nm[col_index]) { @@ -279,8 +295,8 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, // Step 2: Copy the data back out // We know row_size is always aligned with and a multiple of int64_t; - int64_t *long_shared = reinterpret_cast(shared_data); - 
int64_t *long_output = reinterpret_cast(output_data); + int64_t* long_shared = reinterpret_cast(shared_data); + int64_t* long_output = reinterpret_cast(output_data); cudf::size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); cudf::size_type shared_input_stride = blockDim.x * blockDim.y; @@ -303,12 +319,35 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, } } +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + struct block_info { int start_col; int start_row; int end_col; int end_row; int buffer_num; + + __host__ __device__ size_type get_row_size(size_type const* const col_offsets, + size_type const* const col_sizes, + bool debug_print = false) const + { + if (debug_print) + printf("col_offsets[%d]: %p + col_sizes[%d]: %p - col_offsets[%d]: %p\n%d + %d - %d\n", + end_col, + &col_offsets[end_col], + end_col, + &col_sizes[end_col], + start_col, + &col_offsets[start_col], + col_offsets[end_col], + col_sizes[end_col], + col_offsets[start_col]); + return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); + } + __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } + + __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } }; // When building the columns to return, we have to be mindful of the offset limit in cudf. @@ -341,13 +380,14 @@ struct row_batch { */ __global__ void copy_from_columns(const size_type num_rows, const size_type num_columns, - const int8_t **input_data, - const bitmask_type **input_nm, - const size_type *col_sizes, - const size_type *col_offsets, - const block_info *block_infos, - const size_type *row_offsets, - int8_t **output_data) + const size_type shmem_used_per_block, + const size_type num_block_infos, + const int8_t** input_data, + const size_type* col_sizes, + const size_type* col_offsets, + const block_info* block_infos, + const size_type* row_offsets, + int8_t** output_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. @@ -357,239 +397,597 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. 
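// Illustrative sketch, not from the patch itself: the kernels below stage data through shared
// memory with cuda::memcpy_async and cuda::barrier, keeping NUM_BLOCKS_PER_KERNEL_LOADED (= 2)
// staging buffers in flight. This is a stripped-down version of that pattern with illustrative
// names (staged_copy, chunk_bytes, ...), and like the guarded code below it needs sm_70 or newer.
#include <cooperative_groups.h>
#include <cuda/barrier>

#include <cstddef>
#include <cstdint>

__global__ void staged_copy(int8_t* out, int8_t const* in, int num_chunks, int chunk_bytes)
{
  // launch with 2 * chunk_bytes of dynamic shared memory: two staging buffers
  extern __shared__ int8_t staging[];
  int8_t* stage[2] = {staging, staging + chunk_bytes};

  auto block = cooperative_groups::this_thread_block();
  __shared__ cuda::barrier<cuda::thread_scope_block> ready[2];
  if (block.thread_rank() == 0) {
    init(&ready[0], block.size());
    init(&ready[1], block.size());
  }
  block.sync();

  int next_fetch = 0;
  for (int chunk = 0; chunk < num_chunks; ++chunk) {
    // keep up to two async fetches in flight, one per staging buffer
    for (; next_fetch < num_chunks && next_fetch < chunk + 2; ++next_fetch) {
      cuda::memcpy_async(block,
                         stage[next_fetch % 2],
                         in + static_cast<std::size_t>(next_fetch) * chunk_bytes,
                         chunk_bytes,
                         ready[next_fetch % 2]);
    }
    ready[chunk % 2].arrive_and_wait();  // the staging buffer for `chunk` is now resident
    for (int i = block.thread_rank(); i < chunk_bytes; i += block.size()) {
      out[static_cast<std::size_t>(chunk) * chunk_bytes + i] = stage[chunk % 2][i];
    }
    block.sync();  // every thread is done with this buffer before it is refilled
  }
}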
- bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; + constexpr bool debug_print = false; // blockIdx.x == 0 && threadIdx.x == 1; + + constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; + auto group = cooperative_groups::this_thread_block(); + extern __shared__ int8_t shared_data[]; + int8_t* shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; + + __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&block_barrier[i], group.size()); + } + } + + group.sync(); if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("Column Info:\n"); - for (int i = 0; i < num_columns; ++i) { - printf("col %d is at %p with size %d and offset %d\n", - i, - input_data[i], - col_sizes[i], - col_offsets[i]); - } + printf("col sizes at %p, col offsets at %p, and row offsets at %p\n", + col_sizes, + col_offsets, + row_offsets); printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); - /* printf("Row Offsets:\n"); - for (int i=0; i NUM_BLOCKS_PER_KERNEL_LOADED) { fetch_barrier.arrive_and_wait(); } + + // to do the copy we need to do n column copies followed by m element copies OR + // we have to do m element copies followed by r row copies. When going from column + // to row it is much easier to copy by elements first otherwise we would need a running + // total of the column sizes for our block, which isn't readily available. This makes it more + // appealing to copy element-wise from input data into shared matching the end layout and do + // row-based memcopies out. + + for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { + auto const relative_col = el / num_fetch_rows; + auto const relative_row = el % num_fetch_rows; + auto const absolute_col = relative_col + fetch_block.start_col; + auto const absolute_row = relative_row + fetch_block.start_row; + if (debug_print) + printf("row %d(%d), col %d(%d), %d fetch rows, element %d\n", + relative_row, + absolute_row, + relative_col, + absolute_col, + num_fetch_rows, + el); + auto const col_size = col_sizes[absolute_col]; + auto const col_offset = col_offsets[absolute_col]; + auto const relative_col_offset = col_offset - starting_column_offset; + + auto const shared_offset = relative_row * fetch_block_row_size + relative_col_offset; + auto const input_src = input_data[absolute_col] + col_size * absolute_row; + + if (debug_print) + printf("block %lu to shared chunk %lu. 
%p <- %p - %d bytes\n", + fetch, + fetch % stages_count, + &shared[fetch % stages_count][shared_offset], + input_src, + col_size); + + // copy the main + cuda::memcpy_async( + &shared[fetch % stages_count][shared_offset], input_src, col_size, fetch_barrier); + } + } + + auto& subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; + subset_barrier.arrive_and_wait(); + + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; + if (debug_print) + printf("reading block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset); + + /* auto const rows_in_block = block.num_rows(); + auto const cols_in_block = block.num_cols();*/ + auto const block_row_size = block.get_row_size(col_offsets, col_sizes); + auto const column_offset = col_offsets[block.start_col]; + + // copy entire rows to final dest + for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; + absolute_row += blockDim.x) { + auto const relative_row = absolute_row - block.start_row; + auto const output_dest = + output_data[block.buffer_num] + absolute_row * block_row_size + column_offset; + if (debug_print) + printf("processing row %d\noutput data[%d] is address %p\n", + absolute_row, + absolute_row, + output_dest); + auto const shared_offset = block_row_size * relative_row; + if (debug_print) + printf("memcpy %p <- %p - %d bytes which is row %d\n", + output_dest, + &shared[subset % stages_count][shared_offset], + block_row_size, + absolute_row); + cuda::memcpy_async( + output_dest, &shared[subset % stages_count][shared_offset], block_row_size, subset_barrier); + } + } + + // wait on the last copies to complete + for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { + block_barrier[i].arrive_and_wait(); + } +} + +/** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block + * @param offsets + * @param output_data pointer to output data, partitioned by data size + * @param validity_offsets offset into input data row for validity data + * @param block_infos information about the blocks of work + * @param num_block_infos number of infos in blocks array + * @param input_data pointer to input data + * + */ +__global__ void copy_validity_from_columns(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_block, + const size_type* row_offsets, + int8_t** output_data, + const size_type validity_offset, + const block_info* block_infos, + const size_type num_block_infos, + const bitmask_type** input_nm) +{ extern __shared__ int8_t shared_data[]; - uint64_t const output_start_offset = col_offsets[block.start_col] + row_offsets[block.start_row]; - uint8_t const dest_shim_offset = - reinterpret_cast(&output_data[0][output_start_offset]) & - 7; // offset for alignment shim in order to match shared memory with final dest - if (debug_print) { - printf("outputting to offset %lu\n", output_start_offset); - printf("dest shim offset is %d\n", dest_shim_offset); - printf("Shared data is %p-%p\n", shared_data, shared_data + (48 * 1024)); - printf("my block is %d,%d -> %d,%d - buffer %d\n", - block.start_col, - block.start_row, - block.end_col, - block.end_row, - block.buffer_num); + int8_t* shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { + shared_data, shared_data + shmem_used_per_block / 2}; + + constexpr bool print_debug 
= false; //(threadIdx.x==0 || threadIdx.x == 32) && blockIdx.x == 0; + // if (blockIdx.x != 3 || threadIdx.x / 32 != 0) return; + if (print_debug) { + printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); + printf("%d %d - block infos are at %p and my index is %d\n", + threadIdx.x, + blockIdx.x, + block_infos, + blockIdx.x); + printf("%d %d - input nm is %p, input_nm[0] is at %p\n", + threadIdx.x, + blockIdx.x, + input_nm, + input_nm[0]); + printf("shared memory is %p to %p\n", shared_data, shared_data + shmem_used_per_block * 2); + printf("block infos at %p and this is index %d\n", + &block_infos, + blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + 0); + /* printf("Row Offsets:\n"); + for (int i=0; i + shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&shared_block_barriers[i], group.size()); + } } - for (int col = block.start_col; col <= block.end_col; ++col) { - /*if (!col_is_variable) */ { - uint64_t col_offset = 0; - cudf::size_type col_size = col_sizes[col]; - auto const dest_col_offset = - col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; - if (debug_print) { printf("dest col offset %d\n", dest_col_offset); } - for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { - if (debug_print) { - printf("shmem row %d(%d) at offset %d(%d)\n", - row - block.start_row, - row, - (row - block.start_row) * shmem_row_size, - row * shmem_row_size); - } - int8_t *shmem_dest = - &shared_data[dest_col_offset + shmem_row_size * (row - block.start_row)]; - switch (col_size) { - case 1: { - if (debug_print) { printf("%p <- byte %d\n", shmem_dest, input_data[col][row]); } - *shmem_dest = input_data[col][row]; - break; - } - case 2: { - const int16_t *short_col_input = reinterpret_cast(input_data[col]); - if (debug_print) { printf("%p <- short %d\n", shmem_dest, short_col_input[row]); } - *reinterpret_cast(shmem_dest) = short_col_input[row]; - break; - } - case 4: { - const int32_t *int_col_input = reinterpret_cast(input_data[col]); - if (debug_print) { - printf("shmem[%d][%d] - %p <- int 0x%x\n", row, col, shmem_dest, int_col_input[row]); - } - *reinterpret_cast(shmem_dest) = int_col_input[row]; - break; - } - case 8: { - const int64_t *long_col_input = reinterpret_cast(input_data[col]); - if (debug_print) { printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); } - *reinterpret_cast(shmem_dest) = long_col_input[row]; - break; - } - default: { - cudf::size_type input_offset = col_size * row; - if (debug_print) { - printf("byte for byte copy due to size %d of column %d\n", col_size, col); - printf("%p <- input_data[%d] which is %d\n", - shmem_dest, - input_offset, - input_data[col][input_offset]); - } - // TODO this should just not be supported for fixed width columns, but just in case... 
- for (cudf::size_type b = 0; b < col_size; b++) { - shmem_dest[b] = input_data[col][b + input_offset]; + + group.sync(); + + for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { + if (validity_block != validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { + if (print_debug) + printf("%d: waiting at barrier %d\n", + threadIdx.x, + validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED); + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] + .arrive_and_wait(); + if (print_debug) printf("past barrier...\n"); + } + int8_t* this_shared_block = shared_blocks[validity_block % 2]; + if (print_debug) printf("top of loop for validity block %d\n", validity_block); + if (print_debug) + printf("reading validity block info %d at %p\n", + blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block, + &block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]); + auto block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; + + auto const num_block_cols = block.num_cols(); + auto const num_block_rows = block.num_rows(); + + auto const num_sections_x = (num_block_cols + 31) / 32; + auto const num_sections_y = (num_block_rows + 7) / 8; + auto const validity_data_row_length = + align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); + auto const total_sections = num_sections_x * num_sections_y; + + if (print_debug) { + printf("%d %d - block %d has %d cols, %d rows, %d row length, and %d total sections\n", + threadIdx.x, + blockIdx.x, + blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block, + num_block_cols, + num_block_rows, + validity_data_row_length, + total_sections); + } + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + + if (print_debug) + printf( + "%d %d - my warp is %d, %d total sections, %d warps per block, blockDim.x=%d, warp side " + "%d\n", + threadIdx.x, + blockIdx.x, + warp_id, + total_sections, + warps_per_block, + blockDim.x, + detail::warp_size); + // the block is divided into sections. A warp operates on a section at a time. 
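// Illustrative sketch, not from the patch itself: the comment above describes how a validity block
// is carved into 32-column x 8-row sections, one section per warp and one column per lane. The
// section-count arithmetic, restated with illustrative names:
struct section_dims {
  int sections_x;  // ceil(block columns / 32)
  int sections_y;  // ceil(block rows / 8)
};

constexpr section_dims sections_for(int num_block_cols, int num_block_rows)
{
  return {(num_block_cols + 31) / 32, (num_block_rows + 7) / 8};
}
// e.g. a 40-column x 20-row block gives 2 x 3 = 6 sections for the block's warps to share.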
+ for (int my_section_idx = warp_id; my_section_idx < total_sections; + my_section_idx += warps_per_block) { + // convert to rows and cols + auto const section_x = my_section_idx / num_sections_x; + auto const section_y = my_section_idx % num_sections_x; + + if (print_debug) printf("working on section %d of %d...\n", section_x, num_sections_x); + auto const relative_col = section_x * 32 + lane_id; + auto const relative_row = section_y * 8; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + auto const cols_left = num_columns - absolute_col; + + if (print_debug) printf("pre ballot sync...\n"); + auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); + + if (print_debug) + printf( + "participation mask is 0x%x for relative row %d(%d real), relative col %d(%d absolute)\n", + participation_mask, + relative_row, + absolute_row, + relative_col, + absolute_col); + + if (absolute_col < num_columns) { + if (print_debug) + printf( + "thread %d's byte is at %p, participation mask is 0x%x for relative row %d(%d real), " + "relative col %d(%d absolute)\n", + threadIdx.x, + &input_nm[absolute_col][absolute_row / 32], + participation_mask, + relative_row, + absolute_row, + relative_col, + absolute_col); + auto my_byte = + input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] : 0xFF; + + if (print_debug) + printf( + "thread %d's byte is 0x%x, participation mask is 0x%x for relative row %d(%d real), " + "relative col %d(%d absolute)\n", + threadIdx.x, + my_byte & 0xFF, + participation_mask, + relative_row, + absolute_row, + relative_col, + absolute_col); + + // every thread that is participating in the warp has a byte, but it's column-based + // data and we need it in row-based. So we shiffle the bits around with ballot_sync to make + // the bytes we actually write. 
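// Illustrative sketch, not from the patch itself: the ballot described in the comment above,
// reduced to its core. Every active lane holds the validity bits of one column; __ballot_sync
// packs each lane's bit for a given row into one 32-bit word, i.e. 32 columns' worth of row-major
// validity. Names are illustrative; participation_mask is the earlier ballot of lanes that map to
// a real column, and every lane named in it must reach this call together.
#include <cstdint>

__device__ inline uint32_t row_major_validity_word(uint32_t participation_mask,
                                                   uint32_t my_column_bits,  // this column's rows
                                                   int row)                  // 0..7 within the section
{
  return __ballot_sync(participation_mask, (my_column_bits >> row) & 1);
}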
+ for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { + auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); + // lead thread in each warp writes data + auto const validity_write_offset = + validity_data_row_length * (relative_row + i) + relative_col / 8; + if (threadIdx.x % detail::warp_size == 0) { + if (print_debug) + printf( + "%d %d - byte_mask is 0x%x, masked_byte is 0x%x, shared_data_block[%d][%d] = " + "0x%x\n", + threadIdx.x, + blockIdx.x, + byte_mask, + my_byte & byte_mask, + validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED, + validity_write_offset, + validity_data); + if (cols_left <= 8) { + // write byte + if (print_debug) + printf("writing single byte to shared offset 0x%x which is %p...\n", + validity_write_offset, + &this_shared_block[validity_write_offset]); + this_shared_block[validity_write_offset] = validity_data & 0xFF; + } else if (cols_left <= 16) { + // write int16 + if (print_debug) + printf("writing two bytes to shared offset 0x%x which is %p...\n", + validity_write_offset, + &this_shared_block[validity_write_offset]); + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + } else if (cols_left <= 24) { + // write int16 and then int8 + if (print_debug) + printf("writing three bytes to shared offset 0x%x which is %p...\n", + validity_write_offset, + &this_shared_block[validity_write_offset]); + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; + } else { + // write int32 + if (print_debug) + printf("writing 4 bytes to shared offset 0x%x which is %p...\n", + validity_write_offset, + &this_shared_block[validity_write_offset]); + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data; } - break; } } + } + } - // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned - // so we have to rewrite the addresses to make sure that it is 4 byte aligned - // we do this directly in the final location because the entire row may not - // fit in shared memory and may require many blocks to process it entirely - int8_t *valid_byte = - &output_data[block.buffer_num][row_offsets[row] + validity_offset + (col / 8)]; - cudf::size_type byte_bit_offset = col % 8; - uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; - int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); - cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); - if (debug_print) { printf("Outputting validity to %p\n", valid_byte); } - // Now copy validity for the column - if (input_nm[col]) { - if (bit_is_set(input_nm[col], row)) { - atomicOr_block(valid_int, 1 << int_bit_offset); - } else { - atomicAnd_block(valid_int, ~(1 << int_bit_offset)); - } - } else { - // It is valid so just set the bit - atomicOr_block(valid_int, 1 << int_bit_offset); - } - } // end row + // make sure entire block has finished copy + group.sync(); - col_offset += col_sizes[col] * rows_in_block; + // now async memcpy the shared memory out to the final destination + for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { + auto const relative_row = row - block.start_row; + if (print_debug) { + printf( + "base output data is %p, row offset is 0x%x, validity offset into row is 0x%x, word " + "index of block start is 0x%x\n", + output_data[block.buffer_num], + row_offsets[row], + validity_offset, + 
word_index(block.start_col)); + printf( + "%d %d - row %d/%d/%d col %d-%d - %p = shared_data_block[%d][%d] which is %p - %d " + "bytes\n - %p <- 0x%x\n", + threadIdx.x, + blockIdx.x, + block.start_row, + row, + block.end_row, + block.start_col, + block.end_col, + output_data[block.buffer_num] + row_offsets[row] + validity_offset + + (word_index(block.start_col)), + validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED, + validity_data_row_length * relative_row, + &this_shared_block[validity_data_row_length * relative_row], + util::div_rounding_up_unsafe(num_block_cols, 8), + output_data[block.buffer_num] + row_offsets[row] + validity_offset + + word_index(block.start_col), + this_shared_block[validity_data_row_length * relative_row]); + } + auto const output_ptr = + output_data[block.buffer_num] + row_offsets[row] + validity_offset + block.start_col / 8; + auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); + cuda::memcpy_async( + output_ptr, + &this_shared_block[validity_data_row_length * relative_row], + num_bytes, + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); + + /* auto const padding_ptr = output_ptr + num_bytes; + auto const padding_needed = -reinterpret_cast(padding_ptr) & 7; + if (print_debug) printf( + "absolute_row: %d, row_offset for this row: 0x%x, validity data bytes: %d, end + address: %p, padding bytes %lu\n", row, row_offsets[row], num_bytes, output_ptr + + num_bytes, padding_needed); cuda::memcpy_async(padding_ptr, zero, padding_needed, + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); + */ + + /* if (print_debug) { + for (int i=0; i %p\n", - num_single_bytes, - &input_ptr[i + dest_shim_offset], - input_ptr[i + dest_shim_offset], - &output_ptr[i]); - } - output_ptr[i] = input_ptr[i + dest_shim_offset]; - } - } else if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { - // first byte with leading pad - auto const num_single_bytes = 8 - dest_shim_offset; - for (auto i = 0; i < num_single_bytes; ++i) { - if (debug_print) { - printf( - "single byte final write %p -> %p\n", &input_ptr[i + dest_shim_offset], &output_ptr[i]); - } - output_ptr[i] = input_ptr[i + dest_shim_offset]; - } - } else if ((src_offset + 8) % shmem_row_size == 0 && - (real_bytes_in_row + dest_shim_offset) % 8 > 0) { - // last bytes of a row - auto const num_single_bytes = (real_bytes_in_row + dest_shim_offset) % 8; - for (auto i = 0; i < num_single_bytes; ++i) { - if (debug_print) { - printf("single trailing byte final write %p -> %p\n", - &input_ptr[i + dest_shim_offset], - &output_ptr[i]); - } - output_ptr[i] = input_ptr[i + dest_shim_offset]; - } - } else { - // copy 8 bytes aligned - const int64_t *long_col_input = reinterpret_cast(input_ptr); - if (debug_print) { - printf( - "long final write %p -> %p\n", long_col_input, reinterpret_cast(output_ptr)); +} + +static __device__ std::tuple get_admin_data_sizes(size_t col_size_size, + size_t col_offset_size, + int const num_cols) +{ + auto const col_size_bytes = num_cols * col_size_size; + auto const col_offset_bytes = num_cols * col_offset_size; + + return {col_size_bytes, col_offset_bytes}; +} + +/** + * @brief ensure `read_ahead` buffer blocks are fetched + * + * @param fetch_index internal state passed into the function + * @param processing_index index where processing is occuring + * @param read_ahead_count how many blocks to read ahead + * @param max_resident_blocks how many blocks can be loaded at once + * @param total_blocks total number of blocks 
overall + * @param block_infos pointer to the block infos + * @param col_sizes pointer to column size information + * @param col_offsets pointer to the table's column offsets + * @param row_offsets pointer to offsets for each row in the table + * @param input_data pointer to the input data + * @param shared pointer to shared memory + * @param group thread group participating in the fetch + * @param block_barrier barriers used for each block + * @param debug_print + * @return + */ +static __device__ void fetch_blocks_for_row_to_column( + size_t& fetch_index, + size_t const processing_index, + int const read_ahead_count, + int const max_resident_blocks, + int const total_blocks, + block_info const* const block_infos, + size_type const* const col_sizes, + size_type const* const col_offsets, + size_type const* const row_offsets, + int8_t const* const input_data, + int8_t* shared[], + cooperative_groups::thread_block const group, + cuda::barrier* block_barrier, + bool debug_print) +{ + for (; fetch_index < static_cast(total_blocks) && + fetch_index < (processing_index + read_ahead_count); + ++fetch_index) { + if (debug_print) + printf("fetching block %lu of %d\n", + blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index, + total_blocks); + auto const fetch_block = + block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index]; + auto const fetch_block_start_row = fetch_block.start_row; + auto const fetch_block_end_row = fetch_block.end_row; + auto const starting_col_offset = col_offsets[fetch_block.start_col]; + + auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); + auto const num_fetch_cols = fetch_block.num_cols(); + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( + sizeof(decltype(*col_sizes)), sizeof(decltype(*col_offsets)), num_fetch_cols); + auto& fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; + + // if we have fetched all buffers, we need to wait for processing + // to complete on them before we can use them again + if (fetch_index > NUM_BLOCKS_PER_KERNEL_LOADED) { fetch_barrier.arrive_and_wait(); } + + auto shared_row_offset = 0; + // copy the data for column sizes + if (debug_print) + printf("%d: col sizes memcpy_async(group, %p, %p, %d, barrier);\n", + threadIdx.x, + &shared[fetch_index % max_resident_blocks][shared_row_offset], + &col_offsets[fetch_block.start_col], + col_size_bytes); + if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) + printf("%d-%d fetching to %p with barrier %p\n", + threadIdx.x, + blockIdx.x, + shared[fetch_index % max_resident_blocks], + &fetch_barrier); + cuda::memcpy_async(group, + &shared[fetch_index % max_resident_blocks][shared_row_offset], + &col_sizes[fetch_block.start_col], + col_size_bytes, + fetch_barrier); + shared_row_offset += col_size_bytes; + // copy the data for column offsets + if (debug_print) + printf("%d: offsets memcpy_async(group, %p, %p, %d, barrier);\n", + threadIdx.x, + &shared[fetch_index % max_resident_blocks][shared_row_offset], + &col_offsets[fetch_block.start_col], + col_offset_bytes); + cuda::memcpy_async(group, + &shared[fetch_index % max_resident_blocks][shared_row_offset], + &col_offsets[fetch_block.start_col], + col_offset_bytes, + fetch_barrier); + shared_row_offset += col_offset_bytes; + shared_row_offset = align_offset(shared_row_offset, 8); + + if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0 && fetch_block.start_col == 0 && + fetch_block.start_row <= 51 && fetch_block.end_row >= 51) { + printf("Input 
data for col 0 row 51 is 0x"); + for (int i = 0; i < col_sizes[0]; ++i) { + printf("%x ", input_data[row_offsets[51] + col_offsets[0] + i]); } - *reinterpret_cast(output_ptr) = *long_col_input; + printf("\n"); + printf( + "this is at offset %d-%d and starting column offset is %d and we're reading %d bytes\n", + col_offsets[0], + col_offsets[0] + col_sizes[0], + starting_col_offset, + fetch_block_row_size); + auto shared_offset = (51 - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; + printf("destination is %p", &shared[fetch_index % max_resident_blocks][shared_offset]); + } + + for (auto row = fetch_block_start_row + static_cast(threadIdx.x); + row <= fetch_block_end_row; + row += blockDim.x) { + auto shared_offset = (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; + if (debug_print) + printf("fetching block %lu to shared chunk %lu. %p <- %p\n", + fetch_index, + fetch_index % max_resident_blocks, + &shared[fetch_index % max_resident_blocks][shared_offset], + &input_data[row_offsets[row] + starting_col_offset]); + // copy the main + cuda::memcpy_async(&shared[fetch_index % max_resident_blocks][shared_offset], + &input_data[row_offsets[row] + starting_col_offset], + fetch_block_row_size, + fetch_barrier); } } } @@ -600,7 +998,7 @@ __global__ void copy_from_columns(const size_type num_rows, * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets + * @param row_offsets * @param output_data * @param output_nm * @param col_sizes array of sizes for each element in a column - one per column @@ -612,13 +1010,13 @@ __global__ void copy_from_columns(const size_type num_rows, __global__ void copy_to_columns(const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, - const size_type *offsets, - int8_t **output_data, - cudf::bitmask_type **output_nm, - const size_type *col_sizes, - const size_type *col_offsets, - const block_info *block_infos, - const int8_t *input_data) + const size_type* row_offsets, + int8_t** output_data, + const size_type* _col_sizes, + const size_type* _col_offsets, + const block_info* block_infos, + const size_type num_block_infos, + const int8_t* input_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. @@ -628,7 +1026,14 @@ __global__ void copy_to_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. 
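The staging pattern used throughout copy_to_columns can be hard to follow once the barriers are involved. Below is a minimal host-side sketch of the ping-pong scheme it relies on, assuming two resident staging buffers (mirroring NUM_BLOCKS_PER_KERNEL_LOADED); the block count, buffer contents, and plain copies are invented stand-ins for the real cuda::memcpy_async / cuda::barrier machinery.

#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
  constexpr int stages_count = 2;  // mirrors NUM_BLOCKS_PER_KERNEL_LOADED
  int const total_blocks     = 5;  // hypothetical number of blocks assigned to one kernel block

  std::vector<std::vector<int>> staging(stages_count);  // stand-in for the two shared-memory halves

  std::size_t fetch = 0;
  for (std::size_t subset = 0; subset < static_cast<std::size_t>(total_blocks); ++subset) {
    // fetch ahead up to stages_count blocks; block i lands in staging slot i % stages_count
    for (; fetch < static_cast<std::size_t>(total_blocks) && fetch < subset + stages_count; ++fetch) {
      staging[fetch % stages_count] = std::vector<int>(4, static_cast<int>(fetch));  // "memcpy_async"
      std::printf("fetched block %zu into staging slot %zu\n", fetch, fetch % stages_count);
    }
    // "arrive_and_wait": block `subset` is now resident and can be copied out to its destination
    std::printf("processing block %zu from staging slot %zu\n", subset, subset % stages_count);
  }
  return 0;
}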
- constexpr bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; + // to speed up some of the random access memory we do, we copy col_sizes and col_offsets + // to shared memory for each of the blocks that we work on + + /*constexpr*/ bool debug_print = false; // threadIdx.x == 0; + constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; + auto group = cooperative_groups::this_thread_block(); + extern __shared__ int8_t shared_data[]; + int8_t* shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -638,189 +1043,387 @@ __global__ void copy_to_columns(const size_type num_rows, printf("%d: %d\n", i, row_offsets[i]); }*/ printf("output data to %p\n", output_data[block_infos[blockIdx.x].buffer_num]); + printf("shared memory pointers are %p and %p\n", shared[0], shared[1]); + printf("shared_memory ends at %p\n", &shared_data[shmem_used_per_block * 2]); + printf("group is %d threads\n", group.size()); } -// else { return; } + // else { return; } - for (int block_offset = 0; block_offset < NUM_BLOCKS_PER_KERNEL_TO_COLUMNS; ++block_offset) { - auto this_block_index = blockIdx.x*NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + block_offset; - if (this_block_index > blockDim.x) { - break; + __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&block_barrier[i], group.size()); } - auto block = block_infos[this_block_index]; - auto const rows_in_block = block.end_row - block.start_row + 1; - auto const cols_in_block = block.end_col - block.start_col + 1; - extern __shared__ int8_t shared_data[]; + } - // copy data from our block's window to shared memory - // offsets information can get us on the row, then we need to know where the column - // starts to offset into the row data. - - // each thread is responsible for 8-byte chunks starting at threadIdx.x and striding - // at blockDim.x. If the 8-byte chunk falls on the boundary of the window, then the - // thread may copy less than 8 bytes. Even if at the beginning of the window, because - // every internal copy is aligned to 8-byte boundaries. - // - // thread 0 thread 1 thread 2 thread 3 thread 4 thread 5 - // 01234567 89abcdef 01234567 89abcdef 01234567 89abcdef - // xxxbbbbb bbbbbbbb bbbbbbbb bbbbbbbb bbbbbbbb bbxxxxxx - // | | | | | | | - // - // - - auto const window_start_quad = col_offsets[block.start_col] / 8; - auto const window_end_quad = (col_offsets[block.end_col] + col_sizes[block.end_col] + 7) / 8; - auto const window_quad_width = window_end_quad - window_start_quad; - auto const total_quads = window_quad_width * rows_in_block; - auto const shared_memory_starting_pad = col_offsets[block.start_col] & 0x7; + group.sync(); - if (debug_print) { - printf("col_offsets[%d]: %d, col_offsets[%d]: %d col_sizes[%d]: %d\n", block.start_col, col_offsets[block.start_col], block.end_col, col_offsets[block.end_col], block.end_col, col_sizes[block.end_col]); - printf("window start quad is %d, window end quad is %d\n", window_start_quad, window_end_quad); - printf("window quad width is %d and there are %d total quads\n%d shared memory starting pad\n", window_quad_width, total_quads, shared_memory_starting_pad); - } + auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS, + (uint)NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); - // the copy to shared memory will be greedy. 
We know that the data is 8-byte aligned, so we won't - // access illegal memory by doing 8-byte aligned copies, so we can copy 8-byte aligned. This will - // result in the window edges being duplicated across blocks, but we can copy the padding as well - // to speed up our transfers to shared memory. - for (int i = threadIdx.x; i < total_quads; i += blockDim.x) { - auto const relative_row = i / window_quad_width; - auto const absolute_row = relative_row + block.start_row; - //auto const row = i / window_quad_width; - auto const offset_in_row = i % window_quad_width * 8; - auto const shmem_dest = &shared_data[i * 8]; - - if (debug_print) { - printf("relative_row: %d, absolute_row: %d, offset_in_row: %d, shmem_dest: %p\n", relative_row, absolute_row, offset_in_row, shmem_dest); - printf("offsets is %p\n", offsets); - printf("offsets[%d]: %d\n", absolute_row, offsets[absolute_row]); - printf("input_data[%d] will be dereferenced\n", offsets[absolute_row] + offset_in_row); - } + auto get_admin_data_sizes = [col_size_size = sizeof(decltype(*_col_sizes)), + col_offset_size = sizeof(decltype(*_col_offsets))]( + int const num_cols, + int const num_rows) -> std::tuple { + auto const col_size_bytes = num_cols * col_size_size; + auto const col_offset_bytes = num_cols * col_offset_size; - // full 8-byte copy - const int64_t *long_col_input = - reinterpret_cast(&input_data[offsets[absolute_row] + offset_in_row]); - if (debug_print) { - printf("which will be address %p\n", long_col_input); - printf("%p <- long %lu\n", shmem_dest, *long_col_input); } - *reinterpret_cast(shmem_dest) = *long_col_input; - } + return {col_size_bytes, col_offset_bytes}; + }; - __syncthreads(); - - // now we copy from shared memory to final destination. - // the data is laid out in rows in shared memory, so the reads - // for a column will be "vertical". Because of this and the different - // sizes for each column, this portion is handled on row/column basis. - // to prevent each thread working on a single row and also to ensure - // that all threads can do work in the case of more threads than rows, - // we do a global index instead of a double for loop with col/row. 
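For reference, the flattened indexing described in the comment above (used by both the old and the new copy loop) boils down to one strided loop plus a divide and modulo to recover the row and column. A host-side sketch with made-up block dimensions, not values from the patch:

#include <cstdio>

int main()
{
  int const rows_in_block = 3;
  int const cols_in_block = 4;
  int const block_dim     = 5;  // pretend blockDim.x == 5 threads

  for (int tid = 0; tid < block_dim; ++tid) {
    // each "thread" strides over the flattened element grid
    for (int index = tid; index < rows_in_block * cols_in_block; index += block_dim) {
      int const relative_col = index % cols_in_block;
      int const relative_row = index / cols_in_block;
      std::printf("thread %d handles row %d, col %d\n", tid, relative_row, relative_col);
    }
  }
  return 0;
}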
- for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { - auto const relative_col = index % cols_in_block; - auto const relative_row = index / cols_in_block; - auto const absolute_col = relative_col + block.start_col; - auto const absolute_row = relative_row + block.start_row; - - auto const shared_memory_row_offset = window_quad_width * 8 * relative_row; - auto const shared_memory_offset = col_offsets[absolute_col] - col_offsets[block.start_col] + - shared_memory_row_offset + shared_memory_starting_pad; - auto const column_size = col_sizes[absolute_col]; - - int8_t *shmem_src = &shared_data[shared_memory_offset]; - int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; - - if (debug_print) { - printf("relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, shared_mmeory_row_offset: %d, shared_memory_offset: %d," - " column_size: %d, shmem_src: %p, dst: %p\n", relative_col, relative_row, absolute_col, absolute_row, shared_memory_row_offset, shared_memory_offset, column_size, - shmem_src, dst) ; - } - switch (column_size) { - case 1: { - if (debug_print) { printf("%p <- byte %d\n", dst, *shmem_src); } - *dst = *shmem_src; - break; - } - case 2: { - const int16_t *short_col_input = reinterpret_cast(shmem_src); - if (debug_print) { printf("%p <- short %d\n", dst, *short_col_input); } - *reinterpret_cast(dst) = *short_col_input; - break; - } - case 4: { - const int32_t *int_col_input = reinterpret_cast(shmem_src); - if (debug_print) { printf("%p <- int 0x%x\n", dst, *int_col_input); } - *reinterpret_cast(dst) = *int_col_input; - break; - } - case 8: { - const int64_t *long_col_input = reinterpret_cast(shmem_src); - if (debug_print) { printf("%p <- long %lu\n", dst, *long_col_input); } - *reinterpret_cast(dst) = *long_col_input; - break; + if (debug_print) + printf("%d blocks remaining -> %d block infos, %d block index\n", + blocks_remaining, + num_block_infos, + blockIdx.x); + size_t fetch; + size_t subset; + for (subset = fetch = 0; subset < blocks_remaining; ++subset) { + // Fetch ahead up to stages_count subsets + fetch_blocks_for_row_to_column(fetch, + subset, + stages_count, + stages_count, + blocks_remaining, + block_infos, + _col_sizes, + _col_offsets, + row_offsets, + input_data, + shared, + group, + block_barrier, + debug_print); + + auto& subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; + // ensure our data is ready + if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) + printf("%d-%d waiting at barrier %p\n", threadIdx.x, blockIdx.x, &subset_barrier); + subset_barrier.arrive_and_wait(); + + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; + if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) + printf("%d-%d reading block %lu at address %p\n", + threadIdx.x, + blockIdx.x, + blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset, + shared[subset % stages_count]); + + auto const rows_in_block = block.num_rows(); + auto const cols_in_block = block.num_cols(); + + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes(cols_in_block, rows_in_block); + // auto shared_row_offsets = shared[subset]; + auto shared_col_sizes = reinterpret_cast(shared[subset % stages_count]); + auto shared_col_offsets = + reinterpret_cast(&shared[subset % stages_count][col_size_bytes]); + + auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); + + auto block_row_size = block.get_row_size(_col_offsets, _col_sizes, debug_print); + + 
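The shared-memory image built for each staged block is: the block's column sizes, then its column offsets, padded out to an 8-byte boundary, followed by the block's rows, each padded to 8 bytes. The host-side sketch below reproduces only that address arithmetic; the three-column layout and all names are invented for illustration.

#include <cstdint>
#include <cstdio>
#include <vector>

static int32_t align_offset(int32_t offset, int32_t alignment)
{
  return (offset + alignment - 1) & ~(alignment - 1);
}

int main()
{
  // hypothetical block of three columns: int8 at 0, int32 at 4, int64 at 8
  std::vector<int32_t> col_sizes{1, 4, 8};
  std::vector<int32_t> col_offsets{0, 4, 8};

  int32_t const admin_bytes      = static_cast<int32_t>(col_sizes.size() * sizeof(int32_t) * 2);
  int32_t const shared_row_start = align_offset(admin_bytes, 8);
  int32_t const block_row_size =
    align_offset(col_offsets.back() + col_sizes.back() - col_offsets.front(), 8);

  int const relative_row = 2, relative_col = 1;
  int32_t const element_offset = (col_offsets[relative_col] - col_offsets[0]) +
                                 relative_row * block_row_size + shared_row_start;
  std::printf("row size %d, admin %d bytes, element (%d,%d) lives at shared byte %d\n",
              block_row_size, admin_bytes, relative_row, relative_col, element_offset);
  return 0;
}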
// now we copy from shared memory to final destination. + // the data is laid out in rows in shared memory, so the reads + // for a column will be "vertical". Because of this and the different + // sizes for each column, this portion is handled on row/column basis. + // to prevent each thread working on a single row and also to ensure + // that all threads can do work in the case of more threads than rows, + // we do a global index instead of a double for loop with col/row. + for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { + auto const relative_col = index % cols_in_block; + auto const relative_row = index / cols_in_block; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + + if (debug_print) + printf("copying for row %d(%d absolute) col %d(%d absolute)\n", + relative_row, + absolute_row, + relative_col, + absolute_col); + + auto const shared_memory_row_offset = block_row_size * relative_row; + if (debug_print) + printf("shared_col_offsets is %p and relative col is %d, making me access %p\n", + shared_col_offsets, + relative_col, + &shared_col_offsets[relative_col]); + auto const shared_memory_offset = shared_col_offsets[relative_col] - shared_col_offsets[0] + + shared_memory_row_offset + shared_row_offset; + if (debug_print) + printf("shared_col_sizes is %p and relative col is %d, making me access %p\n", + shared_col_sizes, + relative_col, + &shared_col_sizes[relative_col]); + auto const column_size = shared_col_sizes[relative_col]; + + int8_t* shmem_src = &shared[subset % stages_count][shared_memory_offset]; + int8_t* dst = &output_data[absolute_col][absolute_row * column_size]; + + if (debug_print) { + printf( + "relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, " + "shared_mmeory_row_offset: %d, shared_memory_offset: %d," + " column_size: %d, shmem_src: %p, dst: %p\n",//, uint32 is %u\n", + relative_col, + relative_row, + absolute_col, + absolute_row, + shared_memory_row_offset, + shared_memory_offset, + column_size, + shmem_src, + dst/*, + *reinterpret_cast(shmem_src)*/); + printf("memcpy_async(%p, %p, %d, subset_barrier);\n", dst, shmem_src, column_size); } - default: { - if (debug_print) { - printf("byte for byte copy due to size %d of column %d\n", column_size, absolute_col); + if (debug_print && absolute_col == 0 && absolute_row == 51) { + printf("col0row51(%d bytes) = %p - 0x", column_size, shmem_src); + for (int i = 0; i < column_size; ++i) { + printf("%x ", shmem_src[i]); } - // TODO this should just not be supported for fixed width columns, but just in case... 
- for (cudf::size_type b = 0; b < column_size; b++) { dst[b] = shmem_src[b]; } - break; + printf("\n"); } + + cuda::memcpy_async(dst, shmem_src, column_size, subset_barrier); } + group.sync(); + if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) + printf( + "%d-%d copy to main memory with barrier %p\n", threadIdx.x, blockIdx.x, &subset_barrier); + } + + // wait on the last copies to complete + for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { + block_barrier[i].arrive_and_wait(); } +} + +/** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block + * @param offsets + * @param output_nm + * @param validity_offsets offset into input data row for validity data + * @param block_infos information about the blocks of work + * @param num_block_infos number of infos in blocks array + * @param input_data pointer to input data + * + */ +__global__ void copy_validity_to_columns(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_block, + const size_type* row_offsets, + cudf::bitmask_type** output_nm, + const size_type validity_offset, + const block_info* block_infos, + const size_type num_block_infos, + const int8_t* input_data) +{ + extern __shared__ int8_t shared_data[]; + int8_t* shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { + shared_data, shared_data + shmem_used_per_block / 2}; - // now handle validity. Each thread is responsible for 32 rows in 8 columns. - // to prevent indexing issues with a large number of threads, this is compressed - // to a single loop like above. TODO: investigate using shared memory here - auto const validity_batches_per_col = (num_rows + 31) / 32; - auto const validity_batches_total = std::max(1, validity_batches_per_col * (num_columns / 8)); - if (debug_print && threadIdx.x == 0 && blockIdx.x == 0) { - printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n%d blocks of %d threads\n", validity_batches_per_col, validity_batches_total, num_rows, gridDim.x, blockDim.x); + bool print_debug = false; // threadIdx.x == 0 && blockIdx.x == 0; + // bool print_debug = false; + // if (blockIdx.x != 3 || threadIdx.x / 32 != 0) return; + if (print_debug) { + printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); + printf("%d %d - block infos are at %p and my index is %d\n", + threadIdx.x, + blockIdx.x, + block_infos, + blockIdx.x); + printf( + "%d %d - Shared memory starts at %p and ends at %p, input data is %p, output data is %p, row " + "offsets are %p, block infos at %p\n", + threadIdx.x, + blockIdx.x, + shared_data, + shared_data + shmem_used_per_block, + input_data, + output_nm, + row_offsets, + block_infos); + /* printf("Row Offsets:\n"); + for (int i=0; i + shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&shared_block_barriers[i], group.size()); } + } - // one for each column - int32_t dst_validity[8] = {0}; - for (int row = starting_row; row < std::min(num_rows, starting_row + 32); ++row) { - int8_t const * const validity_ptr = &input_data[offsets[row] + validity_offset]; + group.sync(); - if (debug_print) { - printf("%d: validity_ptr is %p for row %d\n", threadIdx.x, validity_ptr, row); - } - - auto const val_byte = 
*validity_ptr; - - for (int i=0; i> src_shift); + for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { + auto const validity_index = validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + if (validity_block != validity_index) { + shared_block_barriers[validity_index].arrive_and_wait(); + } + int8_t* this_shared_block = shared_blocks[validity_block % 2]; + auto const block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; + auto const block_start_col = block.start_col; + auto const block_start_row = block.start_row; + + auto const num_block_cols = block.num_cols(); + auto const num_block_rows = block.num_rows(); + + auto const num_sections_x = (num_block_cols + 7) / 8; + auto const num_sections_y = (num_block_rows + 31) / 32; + auto const validity_data_col_length = align_offset(num_sections_y, 4); + auto const total_sections = num_sections_x * num_sections_y; + + if (print_debug) { + printf("%d %d - block %d has %d cols, %d rows, and %d total sections\n", + threadIdx.x, + blockIdx.x, + blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block, + num_block_cols, + num_block_rows, + total_sections); + } + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + + if (print_debug) + printf( + "%d %d - my warp is %d, %d total sections, %d warps per block, blockDim.x=%d, warp side " + "%d\n", + threadIdx.x, + blockIdx.x, + warp_id, + total_sections, + warps_per_block, + blockDim.x, + detail::warp_size); + // the block is divided into sections. A warp operates on a section at a time. + for (int my_section_idx = warp_id; my_section_idx < total_sections; + my_section_idx += warps_per_block) { + // convert to rows and cols + auto const section_x = my_section_idx % num_sections_x; + auto const section_y = my_section_idx / num_sections_x; + + auto const relative_col = section_x * 8; + auto const relative_row = section_y * 32 + lane_id; + auto const absolute_col = relative_col + block_start_col; + auto const absolute_row = relative_row + block_start_row; + auto const rows_left = num_rows - absolute_row; + + if (print_debug) + printf("%d-%d: si: %d nsx: %d nsy: %d sx: %d sy: %d ar: %d nr: %d rc: %d rr: %d\n", + threadIdx.x, + blockIdx.x, + my_section_idx, + num_sections_x, + num_sections_y, + section_x, + section_y, + absolute_row, + num_rows, + relative_col, + relative_row); + auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_row < num_rows); + + if (absolute_row < num_rows) { + auto const my_byte = + input_data[row_offsets[absolute_row] + validity_offset + absolute_col / 8]; + + // so every thread that is participating in the warp has a byte, but it's row-based + // data and we need it in column-based. So we shiffle the bits around to make + // the bytes we actually write. 
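As a concrete illustration of that shuffle, the host-side loop below reproduces what the warp ballot computes: each of 32 lanes holds one row's validity byte (8 columns), and a ballot over bit i yields the 32-row validity word for column i. The test pattern is arbitrary and the code is purely illustrative of the bit transpose, not part of the kernel.

#include <cstdint>
#include <cstdio>

int main()
{
  uint8_t row_validity[32];
  for (int row = 0; row < 32; ++row) {
    row_validity[row] = static_cast<uint8_t>(row * 7 + 3);  // arbitrary test pattern
  }

  for (int col = 0; col < 8; ++col) {
    uint32_t column_word = 0;  // what __ballot_sync(participation_mask, my_byte & byte_mask) yields
    for (int lane = 0; lane < 32; ++lane) {
      // lane r contributes bit r if its row has column `col` set
      if (row_validity[lane] & (1u << col)) { column_word |= (1u << lane); }
    }
    std::printf("column %d -> 0x%08x\n", col, column_word);
  }
  return 0;
}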
+ for (int i = 0, byte_mask = 1; i < 8 && relative_col + i < num_columns; + ++i, byte_mask <<= 1) { + auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); + // lead thread in each warp writes data + if (threadIdx.x % detail::warp_size == 0) { + auto const validity_write_offset = + validity_data_col_length * (relative_col + i) + relative_row / 8; + + if (print_debug) + printf("%d - Writing validity data 0x%x to shared memory location %d\n", + threadIdx.x, + validity_data, + validity_write_offset); + if (rows_left <= 8) { + // write byte + this_shared_block[validity_write_offset] = validity_data & 0xFF; + } else if (rows_left <= 16) { + // write int16 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + } else if (rows_left <= 24) { + // write int16 and then int8 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; + } else { + // write int32 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data; + } + } } - // auto const dst_bit_mask = 1 << dst_shift; - dst_validity[i] |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); } } - - for (int i=0; i(output_nm[start_col + i] + (starting_row / 32)); - if (debug_print) { - printf("%d-%d: validity write output_nm[%d][%d] - %p <- %d\n", threadIdx.x, blockIdx.x, start_col + i, starting_row, validity_ptr, dst_validity[i]); - } - *validity_ptr = dst_validity[i]; + // make sure entire block has finished copy + group.sync(); + + // now async memcpy the shared + for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { + auto const relative_col = col - block.start_col; + auto const words_to_copy = util::div_rounding_up_unsafe(num_block_rows, 32); + auto const starting_address = output_nm[col] + word_index(block_start_row); + + if (print_debug) + printf("memcpy_async(%p(offset %d), %p, %d, subset_barrier);\n", + starting_address, + word_index(block_start_row), + &this_shared_block[validity_data_col_length * relative_col], + words_to_copy * 4); + cuda::memcpy_async( + output_nm[col] + word_index(block_start_row), + &this_shared_block[validity_data_col_length * relative_col], + util::div_rounding_up_unsafe(num_block_rows, 8), + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); } } + + // if (print_debug) printf("leaving...\n"); + // wait for last blocks of data to arrive + auto const num_blocks_to_wait = blocks_remaining > NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED + ? NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED + : blocks_remaining; + for (int validity_block = 0; validity_block < num_blocks_to_wait; ++validity_block) { + shared_block_barriers[validity_block].arrive_and_wait(); + } } -} + +#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 /** * Calculate the dimensions of the kernel for fixed width only columns. @@ -834,8 +1437,8 @@ __global__ void copy_to_columns(const size_type num_rows, static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, const cudf::size_type num_rows, const cudf::size_type size_per_row, - dim3 &blocks, - dim3 &threads) + dim3& blocks, + dim3& threads) { // We have found speed degrades when a thread handles more than 4 columns. // Each block is 2 dimensional. The y dimension indicates the columns. 
@@ -846,7 +1449,7 @@ static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, // in the x dimension because we use atomic operations at the block // level when writing validity data out to main memory, and that would // need to change if we split a word of validity data between blocks. - int y_block_size = (num_columns + 3) / 4; + int y_block_size = (num_columns + 3) / 4; // cudf::util::div_rounding_up_safe(num_columns, 4); if (y_block_size > 32) { y_block_size = 32; } int x_possible_block_size = 1024 / y_block_size; // 48KB is the default setting for shared memory per block according to the cuda tutorials @@ -895,14 +1498,14 @@ static std::unique_ptr fixed_width_convert_to_rows( const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type size_per_row, - rmm::device_uvector &column_start, - rmm::device_uvector &column_size, - rmm::device_uvector &input_data, - rmm::device_uvector &input_nm, - const cudf::scalar &zero, - const cudf::scalar &scalar_size_per_row, + rmm::device_uvector& column_start, + rmm::device_uvector& column_size, + rmm::device_uvector& input_data, + rmm::device_uvector& input_nm, + const cudf::scalar& zero, + const cudf::scalar& scalar_size_per_row, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { int64_t total_allocation = size_per_row * num_rows; // We made a mistake in the split somehow @@ -944,12 +1547,12 @@ static std::unique_ptr fixed_width_convert_to_rows( mr); } -static cudf::data_type get_data_type(const cudf::column_view &v) { return v.type(); } +static cudf::data_type get_data_type(const cudf::column_view& v) { return v.type(); } -static inline bool are_all_fixed_width(std::vector const &schema) +static inline bool are_all_fixed_width(std::vector const& schema) { return std::all_of( - schema.begin(), schema.end(), [](const cudf::data_type &t) { return cudf::is_fixed_width(t); }); + schema.begin(), schema.end(), [](const cudf::data_type& t) { return cudf::is_fixed_width(t); }); } /** @@ -959,9 +1562,9 @@ static inline bool are_all_fixed_width(std::vector const &schem * @param [out] column_size the size in bytes of the data for each columns in the row. * @return the size in bytes each row needs. */ -static inline int32_t compute_fixed_width_layout(std::vector const &schema, - std::vector &column_start, - std::vector &column_size) +static inline int32_t compute_fixed_width_layout(std::vector const& schema, + std::vector& column_start, + std::vector& column_size) { // We guarantee that the start of each column is 64-bit aligned so anything can go // there, but to make the code simple we will still do an alignment for it. @@ -979,27 +1582,29 @@ static inline int32_t compute_fixed_width_layout(std::vector co // Now we need to add in space for validity // Eventually we can think about nullable vs not nullable, but for now we will just always add it // in - int32_t validity_bytes_needed = (schema.size() + 7) / 8; + int32_t validity_bytes_needed = + (schema.size() + 7) / 8; // cudf::util::div_rounding_up_safe(schema.size(), 8); // validity comes at the end and is byte aligned so we can pack more in. 
at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned return align_offset(at_offset, 8); // 8 bytes (64 bits) } +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + template -static size_type compute_column_information( - iterator begin, - iterator end, - std::vector &column_starts, - std::vector &column_sizes)//, - //std::function nested_type_cb) +static size_type compute_column_information(iterator begin, + iterator end, + std::vector& column_starts, + std::vector& column_sizes) //, +// std::function nested_type_cb) { size_type fixed_width_size_per_row = 0; for (auto cv = begin; cv != end; ++cv) { auto col_type = std::get<0>(*cv); bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; -// if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } + // if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } // a list or string column will write a single uint64 // of data here for offset/length @@ -1021,11 +1626,53 @@ static size_type compute_column_information( //#define DEBUG -static std::vector build_block_infos(std::vector const &column_sizes, - std::vector const &column_starts, - std::vector const &row_batches, - size_type const total_number_of_rows, - size_type const &shmem_limit_per_block) +std::vector build_validity_block_infos( + size_type const& num_columns, + size_type const& num_rows, + size_type const& shmem_limit_per_block, + std::vector const& row_batches) +{ + auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); + auto const column_stride = align_offset( + [&]() { + if (desired_rows_and_columns > num_columns) { + // not many columns, group it into 8s and ship it off + return std::min(8, num_columns); + } else { + return util::round_down_safe(desired_rows_and_columns, 8); + } + }(), + 8); + // we fit as much as we can given the column stride + auto const row_stride = std::min(num_rows, shmem_limit_per_block * 8 / column_stride); + + std::vector validity_block_infos; + for (int col = 0; col < num_columns; col += column_stride) { + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int row = 0; + while (row < num_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(row_stride, rows_left_in_batch); + + validity_block_infos.emplace_back(detail::block_info{ + col, row, std::min(col + column_stride - 1, num_columns - 1), row + window_height - 1}); + row += window_height; + rows_left_in_batch -= window_height; + } + } + + return validity_block_infos; +} + +std::vector build_block_infos(std::vector const& column_sizes, + std::vector const& column_starts, + std::vector const& row_batches, + size_type const total_number_of_rows, + size_type const& shmem_limit_per_block) { std::vector block_infos; @@ -1067,19 +1714,37 @@ static std::vector build_block_infos(std::vector const &c // far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in // bytes, not rows or columns. 
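A worked instance of that sizing rule, following the main branch of the window-height formula below; the shared-memory budget, leading column size, and batch row count here are invented numbers, and round_up is a local stand-in for util::round_up_safe.

#include <algorithm>
#include <cmath>
#include <cstdio>

static int round_up(int value, int multiple) { return ((value + multiple - 1) / multiple) * multiple; }

int main()
{
  int const shmem_limit_per_block = 24 * 1024;  // hypothetical per-block staging budget in bytes
  int const leading_col_size      = 4;          // bytes of the first column in this window
  int const batch_row_count       = 1000;       // rows available in the first row batch

  int const optimal_square_len =
    static_cast<int>(std::sqrt(static_cast<double>(shmem_limit_per_block)));
  int const window_height =
    std::clamp(round_up(std::min(optimal_square_len / leading_col_size, batch_row_count), 32),
               1, batch_row_count);

  std::printf("sqrt(%d) ~= %d bytes per side -> window height %d rows\n",
              shmem_limit_per_block, optimal_square_len, window_height);
  return 0;
}

The window is then widened column by column until the padded row size times the window height, plus the staged column size/offset data, would no longer fit in the shared-memory budget, which is what the loop that follows checks.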
- int const window_height = std::min( - std::min(size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], total_number_of_rows), - row_batches[0].row_count); + size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); + int const window_height = + std::clamp(util::round_up_safe( + optimal_square_len <= (size_type)column_sizes.size() + ? std::min(optimal_square_len / column_sizes[0], total_number_of_rows) + : row_batches[0].row_count / 2, + 32), + 1, + row_batches[0].row_count); #if defined(DEBUG) printf( - "sqrt(shmem_limit_per_block) / column_sizes[0] is %d and num_rows is %d, batch row count is %d - which makes window height " - "%d\n", - size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], + "optimal_square_len is %d and we have %d columns, optimal_square_len / column_sizes[0] is %d " + "and num_rows is %d, batch row count is %d " + "- which makes window height " + "%d - admin size is %lu\n", + optimal_square_len, + (int)column_sizes.size(), + optimal_square_len / column_sizes[0], total_number_of_rows, row_batches[0].row_count, - window_height); + window_height, + column_sizes.size() * sizeof(size_type) * 2); #endif + auto calc_admin_data_size = [](int num_cols) -> size_type { + // admin data is the column sizes and column start information. + // this is copied to shared memory as well and needs to be accounted for + // in the window calculation. + return num_cols * sizeof(size_type) + num_cols * sizeof(size_type); + }; + int row_size = 0; // march each column and build the blocks of appropriate sizes @@ -1092,14 +1757,26 @@ static std::vector build_block_infos(std::vector const &c auto row_size_with_this_col = row_size_aligned + col_size; auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - if (row_size_with_end_pad * window_height > shmem_limit_per_block) { + if (row_size_with_end_pad * window_height + + calc_admin_data_size(col - current_window_start_col) > + shmem_limit_per_block) { #if defined(DEBUG) printf( - "Window size %d too large at column %d, bumping back to build windows of size %d(cols " + "row size with end pad is %d and admin data is %d, which adds up to %d and that is too " + "large for shmem block of %d\n", + row_size_with_end_pad, + calc_admin_data_size(col - current_window_start_col), + row_size_with_end_pad * window_height + + calc_admin_data_size(col - current_window_start_col), + shmem_limit_per_block); + printf( + "Window size %d too large at column %d, admin size is %d, bumping back to build windows of " + "size %d(cols " "%d-%d), which is %d tall. 
Row size is too large at %d and ok at %d(aligned overall is %d) " "for shared mem size %d\n", row_size_with_end_pad * window_height, col, + calc_admin_data_size(col - current_window_start_col), row_size * window_height, current_window_start_col, col - 1, @@ -1136,31 +1813,35 @@ static std::vector build_block_infos(std::vector const &c // build last set of blocks if (current_window_width > 0) { - build_blocks(current_window_start_col, (int)column_sizes.size()-1, window_height); + build_blocks(current_window_start_col, (int)column_sizes.size() - 1, window_height); } return block_infos; } -} // namespace detail #if defined(DEBUG) - void pretty_print(uint64_t i) { - if (i > (1 * 1024 * 1024 * 1024)) { - printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); - } else if (i > (1 * 1024 * 1024)) { - printf("%.2f MB", i / float(1 * 1024 * 1024)); - } else if (i > (1 * 1024)) { - printf("%.2f KB", float(i / 1024)); - } else { - printf("%lu Bytes", i); - } +void pretty_print(uint64_t i) +{ + if (i > (1 * 1024 * 1024 * 1024)) { + printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); + } else if (i > (1 * 1024 * 1024)) { + printf("%.2f MB", i / float(1 * 1024 * 1024)); + } else if (i > (1 * 1024)) { + printf("%.2f KB", float(i / 1024)); + } else { + printf("%lu Bytes", i); } +} #endif +#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + +} // namespace detail -std::vector> convert_to_rows2(cudf::table_view const &tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) +std::vector> convert_to_rows(cudf::table_view const& tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the // data, but small enough that multiple columns fit in memory so the writes can coalese as well. // Potential optimization for window sizes. @@ -1169,9 +1850,13 @@ std::vector> convert_to_rows2(cudf::table_view con int device_id; CUDA_TRY(cudaGetDevice(&device_id)); - int shmem_limit_per_block; - CUDA_TRY( - cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + int total_shmem; + CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + +#if defined(DEBUG) || 1 + total_shmem -= 1024; +#endif + int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; #if defined(DEBUG) size_t free, total; @@ -1195,8 +1880,8 @@ std::vector> convert_to_rows2(cudf::table_view con // windows so the windows can be properly cut around them. 
// Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; + std::vector input_data; + std::vector input_nm; input_data.reserve(num_columns); input_nm.reserve(num_columns); for (size_type column_number = 0; column_number < num_columns; column_number++) { @@ -1224,16 +1909,16 @@ std::vector> convert_to_rows2(cudf::table_view con column_sizes.reserve(num_columns); column_starts.reserve(num_columns + 1); // we add a final offset for validity data start - auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&tbl](auto i) -> std::tuple { - return std::make_tuple(tbl.column(i).type(), tbl.column(i)); - }); + auto iter = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [&tbl](auto i) -> std::tuple { + return std::make_tuple(tbl.column(i).type(), tbl.column(i)); + }); - size_type fixed_width_size_per_row = detail::compute_column_information( - iter, - iter + num_columns, - column_starts, - column_sizes);//, -// [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); + size_type fixed_width_size_per_row = detail::compute_column_information(iter, + iter + num_columns, + column_starts, + column_sizes); //, + // [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); /* size_type fixed_width_size_per_row = 0; for (int col = 0; col < num_columns; ++col) { auto cv = tbl.column(col); @@ -1261,7 +1946,6 @@ std::vector> convert_to_rows2(cudf::table_view con column_starts.back() + column_sizes.back()); #endif - auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); @@ -1329,7 +2013,8 @@ std::vector> convert_to_rows2(cudf::table_view con row_batch_rows++; } if (row_batch_size > 0) { - row_batches.push_back(detail::row_batch{static_cast(row_batch_size), row_batch_rows}); + row_batches.push_back( + detail::row_batch{static_cast(row_batch_size), row_batch_rows}); } auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); @@ -1339,17 +2024,17 @@ std::vector> convert_to_rows2(cudf::table_view con printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { printf("%d: %d rows, ", i, row_batches[i].row_count); - pretty_print(row_batches[i].num_bytes); + detail::pretty_print(row_batches[i].num_bytes); printf("\n"); } #endif std::vector output_buffers; - std::vector output_data; + std::vector output_data; output_data.reserve(row_batches.size()); for (uint i = 0; i < row_batches.size(); ++i) { rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); - output_data.push_back(static_cast(temp.data())); + output_data.push_back(static_cast(temp.data())); output_buffers.push_back(std::move(temp)); } auto dev_output_data = make_device_uvector_async(output_data, stream, mr); @@ -1362,38 +2047,63 @@ std::vector> convert_to_rows2(cudf::table_view con block_infos.size(), block_infos[0].end_col - block_infos[0].start_col + 1, block_infos[0].end_row - block_infos[0].start_row); - pretty_print(shmem_limit_per_block); + detail::pretty_print(shmem_limit_per_block); printf(" shared mem("); - pretty_print(fixed_width_size_per_row); + detail::pretty_print(fixed_width_size_per_row); printf("/row, %d columns, %d rows, ", num_columns, num_rows); - pretty_print(total_table_size); + detail::pretty_print(total_table_size); printf(" total):\n"); #endif auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); 
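The batching logic above exists because the list offsets of the returned byte columns are 32-bit, so a single batch of rows must stay within what those offsets can address. The host-side sketch below shows the idea in simplified form; the row size and the limit are invented stand-ins, and the patch additionally cuts batches on 32-row boundaries so the validity words line up.

#include <cstdint>
#include <cstdio>
#include <vector>

struct row_batch {
  int64_t num_bytes;
  int32_t row_count;
};

int main()
{
  int32_t const num_rows      = 100;
  int64_t const row_size      = 40;    // hypothetical fixed-width row size in bytes
  int64_t const max_batch_len = 1024;  // tiny stand-in for the 32-bit offset limit

  std::vector<row_batch> row_batches;
  int64_t batch_bytes = 0;
  int32_t batch_rows  = 0;
  for (int32_t row = 0; row < num_rows; ++row) {
    if (batch_bytes + row_size > max_batch_len) {  // this row would overflow the batch: cut here
      row_batches.push_back({batch_bytes, batch_rows});
      batch_bytes = 0;
      batch_rows  = 0;
    }
    batch_bytes += row_size;
    batch_rows++;
  }
  if (batch_rows > 0) { row_batches.push_back({batch_bytes, batch_rows}); }

  for (std::size_t i = 0; i < row_batches.size(); ++i) {
    std::printf("batch %zu: %d rows, %ld bytes\n",
                i, row_batches[i].row_count, static_cast<long>(row_batches[i].num_bytes));
  }
  return 0;
}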
// blast through the entire table and convert it - dim3 blocks(block_infos.size()); - #if defined(DEBUG) || 1 - dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)total_table_size)); - #else - dim3 threads(std::min(std::min(1024, shmem_limit_per_block / 8), (int)total_table_size)); - #endif + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS)); + dim3 threads(256); + #if defined(DEBUG) printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); - pretty_print(shmem_limit_per_block); + detail::pretty_print(shmem_limit_per_block); printf(" shared memory\n"); #endif - copy_from_columns<<>>( + detail::copy_from_columns<<>>( num_rows, num_columns, + shmem_limit_per_block, + block_infos.size(), dev_input_data.data(), - dev_input_nm.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), dev_row_offsets.data(), - reinterpret_cast(dev_output_data.data())); + reinterpret_cast(dev_output_data.data())); + + auto validity_block_infos = + build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); + + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); + dim3 validity_blocks( + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); +#if defined(DEBUG) + printf("Launching validity kernel with %d blocks, for %lu validity blocks with %d threads, ", + validity_blocks.x, + validity_block_infos.size(), + validity_threads.x); + detail::pretty_print(total_shmem); + printf(" shared memory\n"); +#endif + detail:: + copy_validity_from_columns<<>>( + num_rows, + num_columns, + shmem_limit_per_block, + dev_row_offsets.data(), + dev_output_data.data(), + column_starts.back(), + dev_validity_block_infos.data(), + validity_block_infos.size(), + dev_input_nm.data()); // split up the output buffer into multiple buffers based on row batch sizes // and create list of byte columns @@ -1428,11 +2138,15 @@ std::vector> convert_to_rows2(cudf::table_view con } return ret; +#else + CUDF_FAIL("Column to row conversion optimization requires volta or later hardware."); + return {}; +#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } -std::vector> convert_to_rows(cudf::table_view const &tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) +std::vector> old_convert_to_rows(cudf::table_view const& tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { const cudf::size_type num_columns = tbl.num_columns(); @@ -1456,8 +2170,8 @@ std::vector> convert_to_rows(cudf::table_view cons cudf::size_type num_rows = tbl.num_rows(); // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; + std::vector input_data; + std::vector input_nm; for (cudf::size_type column_number = 0; column_number < num_columns; column_number++) { cudf::column_view cv = tbl.column(column_number); input_data.emplace_back(cv.data()); @@ -1469,11 +2183,11 @@ std::vector> convert_to_rows(cudf::table_view cons using ScalarType = cudf::scalar_type_t; auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); zero->set_valid_async(true, stream); - static_cast(zero.get())->set_value(0, stream); + static_cast(zero.get())->set_value(0, stream); auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); 
step->set_valid_async(true, stream); - static_cast(step.get()) + static_cast(step.get()) ->set_value(static_cast(size_per_row), stream); std::vector> ret; @@ -1500,11 +2214,12 @@ std::vector> convert_to_rows(cudf::table_view cons } } -std::unique_ptr convert_from_rows2(cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) +std::unique_ptr convert_from_rows(cudf::lists_column_view const& input, + std::vector const& schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 // verify that the types are what we expect cudf::column_view child = input.child(); cudf::type_id list_type = child.type().id(); @@ -1516,11 +2231,13 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i int device_id; CUDA_TRY(cudaGetDevice(&device_id)); - int shmem_limit_per_block; - CUDA_TRY( - cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + int total_shmem; + CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - shmem_limit_per_block /= NUM_BLOCKS_PER_KERNEL_TO_COLUMNS; +#if defined(DEBUG) || 1 + total_shmem -= 1024; +#endif + int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; std::vector column_starts; std::vector column_sizes; @@ -1529,7 +2246,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i return std::make_tuple(schema[i], nullptr); }); size_type fixed_width_size_per_row = detail::compute_column_information( - iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); + iter, iter + num_columns, column_starts, column_sizes); //, [](void *) {}); size_type validity_size = num_bitmask_words(num_columns) * 4; @@ -1537,8 +2254,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i // Ideally we would check that the offsets are all the same, etc. 
but for now // this is probably fine - CUDF_EXPECTS(row_size * num_rows == child.size(), - "The layout of the data appears to be off"); + CUDF_EXPECTS(row_size * num_rows == child.size(), "The layout of the data appears to be off"); auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); @@ -1549,8 +2265,8 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i // Allocate the columns we are going to write into std::vector> output_columns; - std::vector output_data; - std::vector output_nm; + std::vector output_data; + std::vector output_nm; for (cudf::size_type i = 0; i < num_columns; i++) { auto column = cudf::make_fixed_width_column( schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); @@ -1568,36 +2284,97 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - dim3 blocks((block_infos.size() + (NUM_BLOCKS_PER_KERNEL_TO_COLUMNS - 1)) / NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); - #if defined(DEBUG) || 1 - dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)child.size())); - #else - dim3 threads(std::min(std::min(1024, shmem_limit_per_block / 8), (int)child.size())); - #endif + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); +#if defined(DEBUG) + dim3 threads(std::min(std::min(128, shmem_limit_per_block / 8), (int)child.size())); +#else + dim3 threads(std::min(256, (int)child.size())); +#endif #if defined(DEBUG) printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); - pretty_print(shmem_limit_per_block); + detail::pretty_print(total_shmem); printf(" shared memory\n"); #endif - detail::copy_to_columns<<>>( + detail::copy_to_columns<<>>( num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), dev_output_data.data(), - dev_output_nm.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), + block_infos.size(), child.data()); + auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); + auto const column_stride = [&]() { + if (desired_rows_and_columns > num_columns) { + // not many columns, group it into 8s and ship it off + return std::min(8, num_columns); + } else { + return util::round_down_safe(desired_rows_and_columns, 8); + } + }(); + auto const row_stride = [&]() { + // we fit as much as we can, we know the column stride now, so calculate the row + return std::min(num_rows, util::round_down_safe(shmem_limit_per_block * 8 / column_stride, 32)); + /* if (desired_rows_and_columns > num_rows) { + return std::min(32, num_rows); + } else { + return util::round_down_safe(desired_rows_and_columns, 32); + }*/ + }(); + std::vector validity_block_infos; + for (int col = 0; col < num_columns; col += column_stride) { + for (int row = 0; row < num_rows; row += row_stride) { + validity_block_infos.emplace_back( + detail::block_info{col, + row, + std::min(col + column_stride - 1, num_columns - 1), + std::min(row + row_stride - 1, num_rows - 1)}); + } + } + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); + dim3 validity_blocks( + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); +#if defined(DEBUG) + printf( + "Launching validity kernel with %d blocks, for %lu validity blocks, col stride %d and row " + "stride of %d with %d threads, ", + validity_blocks.x, + 
validity_block_infos.size(), + column_stride, + row_stride, + threads.x); + detail::pretty_print(total_shmem); + printf(" shared memory\n"); +#endif + + dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); + detail:: + copy_validity_to_columns<<>>( + num_rows, + num_columns, + shmem_limit_per_block, + input.offsets().data(), + dev_output_nm.data(), + column_starts.back(), + dev_validity_block_infos.data(), + validity_block_infos.size(), + child.data()); + return std::make_unique(std::move(output_columns)); +#else + CUDF_FAIL("Row to column conversion optimization requires volta or later hardware."); + return {}; +#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } -std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) +std::unique_ptr old_convert_from_rows(cudf::lists_column_view const& input, + std::vector const& schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // verify that the types are what we expect cudf::column_view child = input.child(); @@ -1619,12 +2396,12 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in CUDF_EXPECTS(size_per_row * num_rows == child.size(), "The layout of the data appears to be off"); auto dev_column_start = make_device_uvector_async(column_start, stream); - auto dev_column_size = make_device_uvector_async(column_size, stream); + auto dev_column_size = make_device_uvector_async(column_size, stream); // Allocate the columns we are going to write into std::vector> output_columns; - std::vector output_data; - std::vector output_nm; + std::vector output_data; + std::vector output_nm; for (cudf::size_type i = 0; i < num_columns; i++) { auto column = cudf::make_fixed_width_column( schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); @@ -1642,6 +2419,11 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in int shared_size = detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + // printf("Launching (%d, %d, %d) blocks, (%d, %d, %d) threads, with %d shared size\n", + // blocks.x, blocks.y, blocks.z, threads.x, threads.y, threads.z, shared_size); + // printf("pointers are column_start: %p, column_size: %p, output_data: %p, output_nm: %p\n", + // dev_column_start.data(), dev_column_size.data(), dev_output_data.data(), + // dev_output_nm.data()); detail::copy_to_fixed_width_columns<<>>( num_rows, num_columns, @@ -1658,36 +2440,4 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in } } -std::unique_ptr convert_from_rows( - std::vector> const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - CUDF_EXPECTS(input.size() == 1, "Too large of an input, need to concat the output tables..."); - - // for (uint i=0; iview(); - auto ret = convert_from_rows(lcv, schema, stream, mr); - - return ret; - // } -} - -std::unique_ptr convert_from_rows2( - std::vector> const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - CUDF_EXPECTS(input.size() == 1, "Too large of an input, need to concat the output tables..."); - - // for (uint i=0; iview(); - auto ret = convert_from_rows2(lcv, schema, stream, mr); - - return ret; - // } -} - } // namespace cudf diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index 818d7a89ddb..e38b37e81a6 
100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -34,8 +34,8 @@ TEST_F(ColumnToRowTests, Single) cudf::test::fixed_width_column_wrapper a({-1}); cudf::table_view in(std::vector{a}); - auto old_rows = cudf::convert_to_rows(in); - auto new_rows = cudf::convert_to_rows2(in); + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); i++) { @@ -48,8 +48,8 @@ TEST_F(ColumnToRowTests, Simple) cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); cudf::table_view in(std::vector{a}); - auto old_rows = cudf::convert_to_rows(in); - auto new_rows = cudf::convert_to_rows2(in); + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); i++) { @@ -64,8 +64,8 @@ TEST_F(ColumnToRowTests, Tall) cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); cudf::table_view in(std::vector{a}); - auto old_rows = cudf::convert_to_rows(in); - auto new_rows = cudf::convert_to_rows2(in); + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); i++) { @@ -84,8 +84,8 @@ TEST_F(ColumnToRowTests, Wide) } cudf::table_view in(views); - auto old_rows = cudf::convert_to_rows(in); - auto new_rows = cudf::convert_to_rows2(in); + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); i++) { @@ -104,8 +104,31 @@ TEST_F(ColumnToRowTests, SingleByteWide) } cudf::table_view in(views); - auto old_rows = cudf::convert_to_rows(in); - auto new_rows = cudf::convert_to_rows2(in); + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, Big) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + 4096 * i, r + 4096 * i + 4096)); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); i++) { @@ -120,9 +143,9 @@ TEST_F(RowToColumnTests, Single) auto old_rows = cudf::convert_to_rows(in); std::vector schema{cudf::data_type{cudf::type_id::INT32}}; - for (uint i=0; i a({-1, 0, 1}); cudf::table_view in(std::vector{a}); - auto old_rows = cudf::convert_to_rows(in); + auto old_rows = cudf::old_convert_to_rows(in); std::vector schema{cudf::data_type{cudf::type_id::INT32}}; - for (uint i=0; i a(r, r + (size_t)4096); cudf::table_view in(std::vector{a}); - auto old_rows = cudf::convert_to_rows(in); + auto old_rows = cudf::old_convert_to_rows(in); std::vector schema; schema.reserve(in.num_columns()); for (auto col = in.begin(); col < in.end(); ++col) { schema.push_back(col->type()); } - for (uint i=0; i views; for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + 
cols.push_back(cudf::test::fixed_width_column_wrapper({i})); // rand()})); views.push_back(cols.back()); } cudf::table_view in(views); - auto old_rows = cudf::convert_to_rows(in); + auto old_rows = cudf::old_convert_to_rows(in); std::vector schema; schema.reserve(in.num_columns()); for (auto col = in.begin(); col < in.end(); ++col) { schema.push_back(col->type()); } - for (uint i=0; i schema; schema.reserve(in.num_columns()); for (auto col = in.begin(); col < in.end(); ++col) { schema.push_back(col->type()); } - for (uint i=0; i int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + cols.push_back(cudf::test::fixed_width_column_wrapper(r, r + 13)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, Big) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + for (int i = 0; i < 256; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + 4096 * i, r + 4096 * i + 4096)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 68f1ae93dec..1babbc6fd1a 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -14,36 +14,52 @@ * limitations under the License. */ +#include #include +#include #include +#include + +#include +#include + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +#include +#endif #include #include +#include +#include +#include +#include #include #include +#include #include #include #include #include +#include #include - -#include "row_conversion.hpp" - +#include +#include + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +constexpr auto NUM_BLOCKS_PER_KERNEL_TO_COLUMNS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; +constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; +constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; +#endif + +using cudf::detail::make_device_uvector_async; namespace cudf { -namespace java { -/** - * Copy a simple vector to device memory asynchronously. Be sure to read - * the data on the same stream as is used to copy it. 
- */ -template -std::unique_ptr> copy_to_dev_async(const std::vector &input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { - std::unique_ptr> ret(new rmm::device_uvector(input.size(), stream, mr)); - CUDA_TRY(cudaMemcpyAsync(ret->data(), input.data(), sizeof(T) * input.size(), - cudaMemcpyHostToDevice, stream.value())); - return ret; +namespace detail { + +static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size_t alignment) { + return (offset + alignment - 1) & ~(alignment - 1); } __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, @@ -53,7 +69,6 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, const cudf::size_type *num_bytes, int8_t **output_data, cudf::bitmask_type **output_nm, const int8_t *input_data) { - // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -122,7 +137,6 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, cudf::size_type col_index_stride = blockDim.y; for (cudf::size_type col_index = col_index_start; col_index < num_columns; col_index += col_index_stride) { - cudf::size_type col_size = num_bytes[col_index]; const int8_t *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); int8_t *col_output = output_data[col_index]; @@ -208,7 +222,6 @@ copy_from_fixed_width_columns(const cudf::size_type start_row, const cudf::size_ for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; row_group_index += row_group_stride) { - // Within the row group there should be 1 thread for each row. This is a // requirement for launching the kernel cudf::size_type row_index = start_row + (row_group_index * rows_per_group) + threadIdx.x; @@ -220,7 +233,6 @@ copy_from_fixed_width_columns(const cudf::size_type start_row, const cudf::size_ cudf::size_type col_index_stride = blockDim.y; for (cudf::size_type col_index = col_index_start; col_index < num_columns; col_index += col_index_stride) { - cudf::size_type col_size = num_bytes[col_index]; int8_t *col_tmp = &(row_tmp[output_offset_in_row[col_index]]); const int8_t *col_input = input_data[col_index]; @@ -304,6 +316,630 @@ copy_from_fixed_width_columns(const cudf::size_type start_row, const cudf::size_ } } +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + +struct block_info { + int start_col; + int start_row; + int end_col; + int end_row; + int buffer_num; + + __host__ __device__ size_type get_row_size(size_type const *const col_offsets, + size_type const *const col_sizes) const { + return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); + } + __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } + + __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } +}; + +// When building the columns to return, we have to be mindful of the offset limit in cudf. +// It is 32-bit and these data columns are capable of surpassing that easily. The data should +// not be cut off exactly at the limit though due to the validity buffers. The most efficient +// place to cut the validity is on a 32-row boundary, so as we calculate the row sizes +// we keep track of the cut points for the validity, which we call row batches. If the row +// is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we +// hit. 
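(As a rough illustration of where that cap bites, using a hypothetical 24-byte row: a single batch can hold roughly 2^31 / 24, about 89.5 million rows, before the 32-bit offsets of the output lists column would overflow, and the cut is then pulled back to the previous 32-row boundary.)

    // a minimal check of that arithmetic; the 24-byte row is only an example
    constexpr long long max_batch_rows(long long row_bytes) {
      return (2147483647LL / row_bytes) & ~31LL;  // int32_t offset cap, cut back to a 32-row boundary
    }
    static_assert(max_batch_rows(24) == 89478464, "roughly 89.5 million 24-byte rows per batch");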
Note that this boundary is for our book-keeping with column pointers and not anything that +// the kernel needs to worry about. We cut the output at convienient boundaries when assembling +// the outgoing data stream. +struct row_batch { + size_type num_bytes; + size_type row_count; +}; + +/** + * @brief copy data from cudf columns into x format, which is row-based + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param input_data pointer to raw table data + * @param input_nm pointer to validity data + * @param col_sizes array of sizes for each element in a column - one per column + * @param col_offsets offset into input data row for each column's start + * @param block_infos information about the blocks of work + * @param row_offsets offset to a specific row in the input data + * @param output_data pointer to output data + * + */ +__global__ void copy_from_columns(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, + const size_type num_block_infos, const int8_t **input_data, + const size_type *col_sizes, const size_type *col_offsets, + const block_info *block_infos, const size_type *row_offsets, + int8_t **output_data) { + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // This has been broken up for us in the block_info struct, so we don't have + // any calculation to do here, but it is important to note. + + constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; + auto group = cooperative_groups::this_thread_block(); + extern __shared__ int8_t shared_data[]; + int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; + + __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&block_barrier[i], group.size()); + } + } + + group.sync(); + + auto const blocks_remaining = + std::min((uint)(num_block_infos % NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS), + std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS, + (uint)NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + + size_t fetch; + size_t subset; + for (subset = fetch = 0; subset < blocks_remaining; ++subset) { + // Fetch ahead up to stages_count subsets + for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { + auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch]; + + auto const num_fetch_cols = fetch_block.num_cols(); + auto const num_fetch_rows = fetch_block.num_rows(); + auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; + auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); + auto const starting_column_offset = col_offsets[fetch_block.start_col]; + auto &fetch_barrier = block_barrier[fetch % NUM_BLOCKS_PER_KERNEL_LOADED]; + + // wait for the last use of the memory to be completed + if (fetch > NUM_BLOCKS_PER_KERNEL_LOADED) { + fetch_barrier.arrive_and_wait(); + } + + // to do the copy we need to do n column copies followed by m element copies OR + // we have to do m element copies followed by r row copies. 
When going from column + // to row it is much easier to copy by elements first otherwise we would need a running + // total of the column sizes for our block, which isn't readily available. This makes it more + // appealing to copy element-wise from input data into shared matching the end layout and do + // row-based memcopies out. + + for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { + auto const relative_col = el / num_fetch_rows; + auto const relative_row = el % num_fetch_rows; + auto const absolute_col = relative_col + fetch_block.start_col; + auto const absolute_row = relative_row + fetch_block.start_row; + auto const col_size = col_sizes[absolute_col]; + auto const col_offset = col_offsets[absolute_col]; + auto const relative_col_offset = col_offset - starting_column_offset; + + auto const shared_offset = relative_row * fetch_block_row_size + relative_col_offset; + auto const input_src = input_data[absolute_col] + col_size * absolute_row; + + // copy the main + cuda::memcpy_async(&shared[fetch % stages_count][shared_offset], input_src, col_size, + fetch_barrier); + } + } + + auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; + subset_barrier.arrive_and_wait(); + + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; + /* auto const rows_in_block = block.num_rows(); + auto const cols_in_block = block.num_cols();*/ + auto const block_row_size = block.get_row_size(col_offsets, col_sizes); + auto const column_offset = col_offsets[block.start_col]; + + // copy entire rows to final dest + for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; + absolute_row += blockDim.x) { + auto const relative_row = absolute_row - block.start_row; + auto const output_dest = + output_data[block.buffer_num] + absolute_row * block_row_size + column_offset; + auto const shared_offset = block_row_size * relative_row; + cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], block_row_size, + subset_barrier); + } + } + + // wait on the last copies to complete + for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { + block_barrier[i].arrive_and_wait(); + } +} + +/** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block + * @param offsets + * @param output_data pointer to output data, partitioned by data size + * @param validity_offsets offset into input data row for validity data + * @param block_infos information about the blocks of work + * @param num_block_infos number of infos in blocks array + * @param input_data pointer to input data + * + */ +__global__ void copy_validity_from_columns( + const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, + const size_type *row_offsets, int8_t **output_data, const size_type validity_offset, + const block_info *block_infos, const size_type num_block_infos, const bitmask_type **input_nm) { + extern __shared__ int8_t shared_data[]; + int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { + shared_data, shared_data + shmem_used_per_block / 2}; + + // per conversation with DaveB + // each thread of warp reads a single int32 of validity - so we read 128 bytes + // then ballot_sync the bits and write the result to shmem + // after we fill shared mem memcpy it out in a blob. 
+ // probably need knobs for number of rows vs columns to balance read/write + auto group = cooperative_groups::this_thread_block(); + + int const blocks_remaining = + std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, + (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); + + __shared__ cuda::barrier + shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&shared_block_barriers[i], group.size()); + } + } + + group.sync(); + + for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { + if (validity_block != validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] + .arrive_and_wait(); + } + int8_t *this_shared_block = shared_blocks[validity_block % 2]; + auto block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; + + auto const num_block_cols = block.num_cols(); + auto const num_block_rows = block.num_rows(); + + auto const num_sections_x = (num_block_cols + 31) / 32; + auto const num_sections_y = (num_block_rows + 7) / 8; + auto const validity_data_row_length = + align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); + auto const total_sections = num_sections_x * num_sections_y; + + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + + // the block is divided into sections. A warp operates on a section at a time. + for (int my_section_idx = warp_id; my_section_idx < total_sections; + my_section_idx += warps_per_block) { + // convert to rows and cols + auto const section_x = my_section_idx / num_sections_x; + auto const section_y = my_section_idx % num_sections_x; + + auto const relative_col = section_x * 32 + lane_id; + auto const relative_row = section_y * 8; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + auto const cols_left = num_columns - absolute_col; + + auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); + + if (absolute_col < num_columns) { + auto my_byte = + input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] : 0xFF; + + // every thread that is participating in the warp has a byte, but it's column-based + // data and we need it in row-based. So we shiffle the bits around with ballot_sync to make + // the bytes we actually write. 
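(A stripped-down sketch of that ballot step, separate from this kernel and with hypothetical names: every lane of the warp contributes one predicate, __ballot_sync packs the 32 answers into a single register, and one lane stores the packed word.)

    #include <cstdint>

    __global__ void pack_flags_with_ballot(uint8_t const *flags, uint32_t *packed_bits) {
      // assumes a single block whose size is a multiple of 32, and one flag per thread
      auto const lane = threadIdx.x % 32;
      auto const warp = threadIdx.x / 32;
      uint32_t const word = __ballot_sync(0xFFFFFFFF, flags[threadIdx.x] != 0);
      // bit `lane` of `word` holds lane `lane`'s flag, so one store covers 32 bits at once
      if (lane == 0) { packed_bits[warp] = word; }
    }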
+ for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { + auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); + // lead thread in each warp writes data + auto const validity_write_offset = + validity_data_row_length * (relative_row + i) + relative_col / 8; + if (threadIdx.x % detail::warp_size == 0) { + if (cols_left <= 8) { + // write byte + this_shared_block[validity_write_offset] = validity_data & 0xFF; + } else if (cols_left <= 16) { + // write int16 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + } else if (cols_left <= 24) { + // write int16 and then int8 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; + } else { + // write int32 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data; + } + } + } + } + } + + // make sure entire block has finished copy + group.sync(); + + // now async memcpy the shared memory out to the final destination + for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { + auto const relative_row = row - block.start_row; + auto const output_ptr = + output_data[block.buffer_num] + row_offsets[row] + validity_offset + block.start_col / 8; + auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); + cuda::memcpy_async( + output_ptr, &this_shared_block[validity_data_row_length * relative_row], num_bytes, + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); + } + } + + // wait for last blocks of data to arrive + for (int validity_block = 0; + validity_block < blocks_remaining % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + ++validity_block) { + shared_block_barriers[validity_block].arrive_and_wait(); + } +} + +static __device__ std::tuple +get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num_cols) { + auto const col_size_bytes = num_cols * col_size_size; + auto const col_offset_bytes = num_cols * col_offset_size; + + return {col_size_bytes, col_offset_bytes}; +} + +/** + * @brief ensure `read_ahead` buffer blocks are fetched + * + * @param fetch_index internal state passed into the function + * @param processing_index index where processing is occuring + * @param read_ahead_count how many blocks to read ahead + * @param max_resident_blocks how many blocks can be loaded at once + * @param total_blocks total number of blocks overall + * @param block_infos pointer to the block infos + * @param col_sizes pointer to column size information + * @param col_offsets pointer to the table's column offsets + * @param row_offsets pointer to offsets for each row in the table + * @param input_data pointer to the input data + * @param shared pointer to shared memory + * @param group thread group participating in the fetch + * @param block_barrier barriers used for each block + * @return + */ +static __device__ void +fetch_blocks_for_row_to_column(size_t &fetch_index, size_t const processing_index, + int const read_ahead_count, int const max_resident_blocks, + int const total_blocks, block_info const *const block_infos, + size_type const *const col_sizes, size_type const *const col_offsets, + size_type const *const row_offsets, int8_t const *const input_data, + int8_t *shared[], cooperative_groups::thread_block const group, + cuda::barrier *block_barrier) { + for (; fetch_index < static_cast(total_blocks) && + fetch_index < 
(processing_index + read_ahead_count); + ++fetch_index) { + auto const fetch_block = + block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index]; + auto const fetch_block_start_row = fetch_block.start_row; + auto const fetch_block_end_row = fetch_block.end_row; + auto const starting_col_offset = col_offsets[fetch_block.start_col]; + + auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); + auto const num_fetch_cols = fetch_block.num_cols(); + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( + sizeof(decltype(*col_sizes)), sizeof(decltype(*col_offsets)), num_fetch_cols); + auto &fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; + + // if we have fetched all buffers, we need to wait for processing + // to complete on them before we can use them again + if (fetch_index > NUM_BLOCKS_PER_KERNEL_LOADED) { + fetch_barrier.arrive_and_wait(); + } + + auto shared_row_offset = 0; + // copy the data for column sizes + cuda::memcpy_async(group, &shared[fetch_index % max_resident_blocks][shared_row_offset], + &col_sizes[fetch_block.start_col], col_size_bytes, fetch_barrier); + shared_row_offset += col_size_bytes; + // copy the data for column offsets + cuda::memcpy_async(group, &shared[fetch_index % max_resident_blocks][shared_row_offset], + &col_offsets[fetch_block.start_col], col_offset_bytes, fetch_barrier); + shared_row_offset += col_offset_bytes; + shared_row_offset = align_offset(shared_row_offset, 8); + + for (auto row = fetch_block_start_row + static_cast(threadIdx.x); + row <= fetch_block_end_row; row += blockDim.x) { + auto shared_offset = (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; + // copy the main + cuda::memcpy_async(&shared[fetch_index % max_resident_blocks][shared_offset], + &input_data[row_offsets[row] + starting_col_offset], fetch_block_row_size, + fetch_barrier); + } + } +} + +/** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block + * @param row_offsets + * @param output_data + * @param output_nm + * @param col_sizes array of sizes for each element in a column - one per column + * @param col_offsets offset into input data row for each column's start + * @param block_infos information about the blocks of work + * @param input_data pointer to input data + * + */ +__global__ void copy_to_columns(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, const size_type *row_offsets, + int8_t **output_data, const size_type *_col_sizes, + const size_type *_col_offsets, const block_info *block_infos, + const size_type num_block_infos, const int8_t *input_data) { + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // This has been broken up for us in the block_info struct, so we don't have + // any calculation to do here, but it is important to note. 
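(The same fetch/process shape, stripped down to a plain byte copy with hypothetical names: one shared-memory buffer is filled by cuda::memcpy_async while the other is consumed, with a cuda::barrier per buffer signalling when its copy has landed. Like the kernels here, this needs compute capability 7.0 or later.)

    #include <cstdint>
    #include <cooperative_groups.h>
    #include <cuda/barrier>

    __global__ void double_buffered_copy(int8_t const *in, int8_t *out, int tile_bytes, int num_tiles) {
      extern __shared__ int8_t smem[];  // launch with 2 * tile_bytes of dynamic shared memory
      int8_t *buf[2] = {smem, smem + tile_bytes};
      __shared__ cuda::barrier<cuda::thread_scope_block> ready[2];
      auto block = cooperative_groups::this_thread_block();
      if (block.thread_rank() == 0) {
        init(&ready[0], block.size());
        init(&ready[1], block.size());
      }
      block.sync();

      int fetch = 0;
      for (int tile = 0; tile < num_tiles; ++tile) {
        // keep up to two tiles in flight
        for (; fetch < num_tiles && fetch < tile + 2; ++fetch) {
          cuda::memcpy_async(block, buf[fetch % 2], in + fetch * tile_bytes, tile_bytes,
                             ready[fetch % 2]);
        }
        ready[tile % 2].arrive_and_wait();  // block until this tile's bytes are in shared memory
        for (int i = block.thread_rank(); i < tile_bytes; i += block.size()) {
          out[tile * tile_bytes + i] = buf[tile % 2][i];  // stand-in for the real per-element work
        }
        block.sync();  // everyone is done reading this buffer before it gets refilled
      }
    }

The conversion kernels in this file follow the same shape, just with block_info-sized tiles and per-element or per-row copies instead of one flat blob.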
+ + // to speed up some of the random access memory we do, we copy col_sizes and col_offsets + // to shared memory for each of the blocks that we work on + + constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; + auto group = cooperative_groups::this_thread_block(); + extern __shared__ int8_t shared_data[]; + int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; + + __shared__ cuda::barrier block_barrier[stages_count]; + if (group.thread_rank() == 0) { + for (int i = 0; i < stages_count; ++i) { + init(&block_barrier[i], group.size()); + } + } + + group.sync(); + + auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS, + (uint)NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); + + auto get_admin_data_sizes = [col_size_size = sizeof(decltype(*_col_sizes)), + col_offset_size = sizeof(decltype(*_col_offsets))]( + int const num_cols, + int const num_rows) -> std::tuple { + auto const col_size_bytes = num_cols * col_size_size; + auto const col_offset_bytes = num_cols * col_offset_size; + + return {col_size_bytes, col_offset_bytes}; + }; + + size_t fetch; + size_t subset; + for (subset = fetch = 0; subset < blocks_remaining; ++subset) { + // Fetch ahead up to stages_count subsets + fetch_blocks_for_row_to_column(fetch, subset, stages_count, stages_count, blocks_remaining, + block_infos, _col_sizes, _col_offsets, row_offsets, input_data, + shared, group, block_barrier); + + auto &subset_barrier = block_barrier[subset % stages_count]; + // ensure our data is ready + subset_barrier.arrive_and_wait(); + + auto const block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; + + auto const rows_in_block = block.num_rows(); + auto const cols_in_block = block.num_cols(); + + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes(cols_in_block, rows_in_block); + // auto shared_row_offsets = shared[subset]; + auto shared_col_sizes = reinterpret_cast(shared[subset % stages_count]); + auto shared_col_offsets = + reinterpret_cast(&shared[subset % stages_count][col_size_bytes]); + + auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); + + auto block_row_size = block.get_row_size(_col_offsets, _col_sizes); + + // now we copy from shared memory to final destination. + // the data is laid out in rows in shared memory, so the reads + // for a column will be "vertical". Because of this and the different + // sizes for each column, this portion is handled on row/column basis. + // to prevent each thread working on a single row and also to ensure + // that all threads can do work in the case of more threads than rows, + // we do a global index instead of a double for loop with col/row. 
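(Concretely, the flat index walks the tile row-major, so for a hypothetical 3-column tile index 0..2 is row 0, index 3..5 is row 1, and each thread strides that range by blockDim.x; a trivial host-side check of the mapping:)

    #include <cassert>

    int main() {
      int const cols_in_block = 3, rows_in_block = 4;
      for (int index = 0; index < rows_in_block * cols_in_block; ++index) {
        int const relative_col = index % cols_in_block;
        int const relative_row = index / cols_in_block;
        assert(relative_row * cols_in_block + relative_col == index);  // every element hit exactly once
      }
      return 0;
    }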
+ for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { + auto const relative_col = index % cols_in_block; + auto const relative_row = index / cols_in_block; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + + auto const shared_memory_row_offset = block_row_size * relative_row; + auto const shared_memory_offset = shared_col_offsets[relative_col] - shared_col_offsets[0] + + shared_memory_row_offset + shared_row_offset; + auto const column_size = shared_col_sizes[relative_col]; + + int8_t *shmem_src = &shared[subset % stages_count][shared_memory_offset]; + int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; + + cuda::memcpy_async(dst, shmem_src, column_size, subset_barrier); + } + group.sync(); + } + + // wait on the last copies to complete + for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { + block_barrier[i].arrive_and_wait(); + } +} + +/** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block + * @param offsets + * @param output_nm + * @param validity_offsets offset into input data row for validity data + * @param block_infos information about the blocks of work + * @param num_block_infos number of infos in blocks array + * @param input_data pointer to input data + * + */ +__global__ void copy_validity_to_columns( + const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, + const size_type *row_offsets, cudf::bitmask_type **output_nm, const size_type validity_offset, + const block_info *block_infos, const size_type num_block_infos, const int8_t *input_data) { + extern __shared__ int8_t shared_data[]; + int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { + shared_data, shared_data + shmem_used_per_block / 2}; + + // per conversation with DaveB + // each thread of warp reads a single byte of validity - so we read 32 bytes + // then ballot_sync the bits and write the result to shmem + // after we fill shared mem memcpy it out in a blob. 
+ // probably need knobs for number of rows vs columns to balance read/write + auto group = cooperative_groups::this_thread_block(); + + int const blocks_remaining = + std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, + (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); + + __shared__ cuda::barrier + shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&shared_block_barriers[i], group.size()); + } + } + + group.sync(); + + for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { + auto const validity_index = validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + if (validity_block != validity_index) { + shared_block_barriers[validity_index].arrive_and_wait(); + } + int8_t *this_shared_block = shared_blocks[validity_block % 2]; + auto const block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; + auto const block_start_col = block.start_col; + auto const block_start_row = block.start_row; + + auto const num_block_cols = block.num_cols(); + auto const num_block_rows = block.num_rows(); + + auto const num_sections_x = (num_block_cols + 7) / 8; + auto const num_sections_y = (num_block_rows + 31) / 32; + auto const validity_data_col_length = align_offset(num_sections_y, 4); + auto const total_sections = num_sections_x * num_sections_y; + + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + + // the block is divided into sections. A warp operates on a section at a time. + for (int my_section_idx = warp_id; my_section_idx < total_sections; + my_section_idx += warps_per_block) { + // convert to rows and cols + auto const section_x = my_section_idx % num_sections_x; + auto const section_y = my_section_idx / num_sections_x; + + auto const relative_col = section_x * 8; + auto const relative_row = section_y * 32 + lane_id; + auto const absolute_col = relative_col + block_start_col; + auto const absolute_row = relative_row + block_start_row; + auto const rows_left = num_rows - absolute_row; + + auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_row < num_rows); + + if (absolute_row < num_rows) { + auto const my_byte = + input_data[row_offsets[absolute_row] + validity_offset + absolute_col / 8]; + + // so every thread that is participating in the warp has a byte, but it's row-based + // data and we need it in column-based. So we shiffle the bits around to make + // the bytes we actually write. 
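      // Put differently, and assuming the validity windows start on an 8-column boundary as the
      // ones built for this kernel do: the byte read above holds the validity bits for columns
      // absolute_col .. absolute_col + 7 of this row, bit 0 being absolute_col. The loop below
      // peels those bits off with one __ballot_sync per column, which gathers that column's bit
      // from the 32 rows held by the warp's lanes.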
+ for (int i = 0, byte_mask = 1; i < 8 && relative_col + i < num_columns; + ++i, byte_mask <<= 1) { + auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); + // lead thread in each warp writes data + if (threadIdx.x % detail::warp_size == 0) { + auto const validity_write_offset = + validity_data_col_length * (relative_col + i) + relative_row / 8; + + if (rows_left <= 8) { + // write byte + this_shared_block[validity_write_offset] = validity_data & 0xFF; + } else if (rows_left <= 16) { + // write int16 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + } else if (rows_left <= 24) { + // write int16 and then int8 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; + } else { + // write int32 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data; + } + } + } + } + } + + // make sure entire block has finished copy + group.sync(); + + // now async memcpy the shared + for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { + auto const relative_col = col - block.start_col; + + cuda::memcpy_async( + output_nm[col] + word_index(block_start_row), + &this_shared_block[validity_data_col_length * relative_col], + util::div_rounding_up_unsafe(num_block_rows, 8), + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); + } + } + + // wait for last blocks of data to arrive + auto const num_blocks_to_wait = blocks_remaining > NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED ? + NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED : + blocks_remaining; + for (int validity_block = 0; validity_block < num_blocks_to_wait; ++validity_block) { + shared_block_barriers[validity_block].arrive_and_wait(); + } +} + +#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + /** * Calculate the dimensions of the kernel for fixed width only columns. * @param [in] num_columns the number of columns being copied. @@ -317,7 +953,6 @@ static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, const cudf::size_type num_rows, const cudf::size_type size_per_row, dim3 &blocks, dim3 &threads) { - // We have found speed degrades when a thread handles more than 4 columns. // Each block is 2 dimensional. The y dimension indicates the columns. // We limit this to 32 threads in the y dimension so we can still @@ -327,10 +962,9 @@ static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, // in the x dimension because we use atomic operations at the block // level when writing validity data out to main memory, and that would // need to change if we split a word of validity data between blocks. - int y_block_size = (num_columns + 3) / 4; - if (y_block_size > 32) { + int y_block_size = (num_columns + 3) / 4; // cudf::util::div_rounding_up_safe(num_columns, 4); + if (y_block_size > 32) y_block_size = 32; - } int x_possible_block_size = 1024 / y_block_size; // 48KB is the default setting for shared memory per block according to the cuda tutorials // If someone configures the GPU to only have 16 KB this might not work. @@ -373,15 +1007,15 @@ static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, * going from start row and containing the next num_rows. Most of the parameters passed * into this function are common between runs and should be calculated once. 
*/ -static std::unique_ptr fixed_width_convert_to_rows( - const cudf::size_type start_row, const cudf::size_type num_rows, - const cudf::size_type num_columns, const cudf::size_type size_per_row, - std::unique_ptr> &column_start, - std::unique_ptr> &column_size, - std::unique_ptr> &input_data, - std::unique_ptr> &input_nm, - const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { +static std::unique_ptr +fixed_width_convert_to_rows(const cudf::size_type start_row, const cudf::size_type num_rows, + const cudf::size_type num_columns, const cudf::size_type size_per_row, + rmm::device_uvector &column_start, + rmm::device_uvector &column_size, + rmm::device_uvector &input_data, + rmm::device_uvector &input_nm, + const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { int64_t total_allocation = size_per_row * num_rows; // We made a mistake in the split somehow CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); @@ -397,30 +1031,23 @@ static std::unique_ptr fixed_width_convert_to_rows( dim3 blocks; dim3 threads; int shared_size = - calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); copy_from_fixed_width_columns<<>>( - start_row, num_rows, num_columns, size_per_row, column_start->data(), column_size->data(), - input_data->data(), input_nm->data(), data->mutable_view().data()); + start_row, num_rows, num_columns, size_per_row, column_start.data(), column_size.data(), + input_data.data(), input_nm.data(), data->mutable_view().data()); return cudf::make_lists_column(num_rows, std::move(offsets), std::move(data), 0, - rmm::device_buffer{}, stream, mr); + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr); } static cudf::data_type get_data_type(const cudf::column_view &v) { return v.type(); } -static bool is_fixed_width(const cudf::data_type &t) { - return cudf::is_fixed_width(t); -} - -static inline int32_t align_offset(int32_t offset, std::size_t alignment) { - return (offset + alignment - 1) & ~(alignment - 1); -} - static inline bool are_all_fixed_width(std::vector const &schema) { - return std::all_of(schema.begin(), schema.end(), cudf::java::is_fixed_width); + return std::all_of(schema.begin(), schema.end(), + [](const cudf::data_type &t) { return cudf::is_fixed_width(t); }); } /** @@ -449,30 +1076,443 @@ static inline int32_t compute_fixed_width_layout(std::vector co // Now we need to add in space for validity // Eventually we can think about nullable vs not nullable, but for now we will just always add it // in - int32_t validity_bytes_needed = (schema.size() + 7) / 8; + int32_t validity_bytes_needed = + (schema.size() + 7) / 8; // cudf::util::div_rounding_up_safe(schema.size(), 8); // validity comes at the end and is byte aligned so we can pack more in. 
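(A small standalone re-derivation of this layout arithmetic, for a hypothetical three-column schema; align_offset is the same power-of-two rounding helper defined earlier in this file:)

    #include <cassert>
    #include <cstdint>
    #include <vector>

    static int32_t align_offset(int32_t offset, std::size_t alignment) {
      return (offset + alignment - 1) & ~(alignment - 1);  // alignment must be a power of two
    }

    int main() {
      std::vector<int32_t> sizes{1, 4, 8};  // e.g. INT8, INT32, INT64
      std::vector<int32_t> starts;
      int32_t at_offset = 0;
      for (auto size : sizes) {
        at_offset = align_offset(at_offset, size);  // each value is naturally aligned in the row
        starts.push_back(at_offset);
        at_offset += size;
      }
      at_offset += (int32_t(sizes.size()) + 7) / 8;        // one validity bit per column, byte aligned
      int32_t const row_size = align_offset(at_offset, 8); // rows are 8-byte aligned
      assert(starts[0] == 0 && starts[1] == 4 && starts[2] == 8);
      assert(row_size == 24);  // 16 bytes of data, 1 validity byte, 7 bytes of padding
      return 0;
    }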
at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned return align_offset(at_offset, 8); // 8 bytes (64 bits) } +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + +template +static size_type compute_column_information(iterator begin, iterator end, + std::vector &column_starts, + std::vector &column_sizes) //, +// std::function nested_type_cb) +{ + size_type fixed_width_size_per_row = 0; + for (auto cv = begin; cv != end; ++cv) { + auto col_type = std::get<0>(*cv); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + // if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + } + + auto validity_offset = detail::align_offset(fixed_width_size_per_row, 4); + column_starts.push_back(validity_offset); + + return fixed_width_size_per_row; +} + +std::vector +build_validity_block_infos(size_type const &num_columns, size_type const &num_rows, + size_type const &shmem_limit_per_block, + std::vector const &row_batches) { + auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); + auto const column_stride = align_offset( + [&]() { + if (desired_rows_and_columns > num_columns) { + // not many columns, group it into 8s and ship it off + return std::min(8, num_columns); + } else { + return util::round_down_safe(desired_rows_and_columns, 8); + } + }(), + 8); + // we fit as much as we can given the column stride + auto const row_stride = std::min(num_rows, shmem_limit_per_block * 8 / column_stride); + + std::vector validity_block_infos; + for (int col = 0; col < num_columns; col += column_stride) { + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int row = 0; + while (row < num_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(row_stride, rows_left_in_batch); + + validity_block_infos.emplace_back(detail::block_info{ + col, row, std::min(col + column_stride - 1, num_columns - 1), row + window_height - 1}); + row += window_height; + rows_left_in_batch -= window_height; + } + } + + return validity_block_infos; +} + +std::vector build_block_infos(std::vector const &column_sizes, + std::vector const &column_starts, + std::vector const &row_batches, + size_type const total_number_of_rows, + size_type const &shmem_limit_per_block) { + std::vector block_infos; + + // block infos are organized with the windows going "down" the columns + // this provides the most coalescing of memory access + int current_window_width = 0; + int current_window_start_col = 0; + + // build the blocks for a specific set of columns + auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( + int const start_col, int const end_col, int const desired_window_height) { + int current_window_start_row = 0; + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; + 
while (i < total_number_of_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(desired_window_height, rows_left_in_batch); + + block_infos.emplace_back(detail::block_info{ + start_col, current_window_start_row, end_col, + std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), + current_window_row_batch}); + + i += window_height; + current_window_start_row += window_height; + rows_left_in_batch -= window_height; + } + }; + + // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write + // would be memory cache line sized access, but since other blocks will read/write the edges this + // may not turn out to be overly important. For now, we will attempt to build a square window as + // far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we + // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in + // bytes, not rows or columns. + size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); + int const window_height = + std::clamp(util::round_up_safe( + optimal_square_len <= (size_type)column_sizes.size() ? + std::min(optimal_square_len / column_sizes[0], total_number_of_rows) : + row_batches[0].row_count / 2, + 32), + 1, row_batches[0].row_count); + + auto calc_admin_data_size = [](int num_cols) -> size_type { + // admin data is the column sizes and column start information. + // this is copied to shared memory as well and needs to be accounted for + // in the window calculation. + return num_cols * sizeof(size_type) + num_cols * sizeof(size_type); + }; + + int row_size = 0; + + // march each column and build the blocks of appropriate sizes + for (unsigned int col = 0; col < column_sizes.size(); ++col) { + auto const col_size = column_sizes[col]; + + // align size for this type + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_aligned = detail::align_offset(row_size, alignment_needed); + auto row_size_with_this_col = row_size_aligned + col_size; + auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); + + if (row_size_with_end_pad * window_height + + calc_admin_data_size(col - current_window_start_col) > + shmem_limit_per_block) { + // too large, close this window, generate vertical blocks and restart + build_blocks(current_window_start_col, col - 1, window_height); + row_size = + detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); + row_size += col_size; // alignment required for shared memory window boundary to match + // alignment of output row + current_window_start_col = col; + current_window_width = 0; + } else { + row_size = row_size_with_this_col; + current_window_width++; + } + } + + // build last set of blocks + if (current_window_width > 0) { + build_blocks(current_window_start_col, (int)column_sizes.size() - 1, window_height); + } + + return block_infos; +} + +#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + +} // namespace detail + std::vector> convert_to_rows(cudf::table_view const &tbl, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the + // data, but small enough that multiple columns fit in memory so the 
writes can coalese as well. + // Potential optimization for window sizes. + const size_type num_columns = tbl.num_columns(); + const size_type num_rows = tbl.num_rows(); + + int device_id; + CUDA_TRY(cudaGetDevice(&device_id)); + int total_shmem; + CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + + // TODO: kernels fail to launch if we use all the available shared memory. + total_shmem -= 1024; + + int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + + // break up the work into blocks, which are a starting and ending row/col #. + // this window size is calculated based on the shared memory size available + // we want a single block to fill up the entire shared memory space available + // for the transpose-like conversion. + + // There are two different processes going on here. The GPU conversion of the data + // and the writing of the data into the list of byte columns that are a maximum of + // 2 gigs each due to offset maximum size. The GPU conversion portion has to understand + // this limitation because the column must own the data inside and as a result it must be + // a distinct allocation for that column. Copying the data into these final buffers would + // be prohibitively expensive, so care is taken to ensure the GPU writes to the proper buffer. + // The windows are broken at the boundaries of specific rows based on the row sizes up + // to that point. These are row batches and they are decided first before building the + // windows so the windows can be properly cut around them. + + // Get the pointers to the input columnar data ready + std::vector input_data; + std::vector input_nm; + input_data.reserve(num_columns); + input_nm.reserve(num_columns); + for (size_type column_number = 0; column_number < num_columns; column_number++) { + column_view cv = tbl.column(column_number); + auto const col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (!nested_type) { + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + } + auto dev_input_data = make_device_uvector_async(input_data, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); + + std::vector row_sizes; // size of each row in bytes including any alignment padding + std::vector row_offsets; // offset from the start of the data to this row + std::vector column_sizes; // byte size of each column + std::vector column_starts; // offset of column inside a row including alignment + std::vector + variable_width_columns; // list of the variable width columns in the table + row_sizes.reserve(num_rows); + row_offsets.reserve(num_rows); + column_sizes.reserve(num_columns); + column_starts.reserve(num_columns + 1); // we add a final offset for validity data start + + auto iter = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [&tbl](auto i) -> std::tuple { + return std::make_tuple(tbl.column(i).type(), tbl.column(i)); + }); + + size_type fixed_width_size_per_row = + detail::compute_column_information(iter, iter + num_columns, column_starts, + column_sizes); //, + // [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); + /* size_type fixed_width_size_per_row = 0; + for (int col = 0; col < num_columns; ++col) { + auto cv = tbl.column(col); + auto col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (nested_type) { 
variable_width_columns.push_back(cv); } + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + }*/ + + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); + + std::vector row_batches; + + auto calculate_variable_width_row_data_size = [](int const row) { + // each level of variable-width data will add an offset/length + // uint64 of data. The first of which is inside the fixed-width + // data itself and needs to be aligned based on what is around + // that data. This is handled above with the fixed-width calculations + // for that reason. We may still need to add more of these offset/length + // combinations if the nesting is deeper than one level as these + // will be included in the variable-width data blob at the end of the + // row. + return 0; + /* auto c = variable_width_columns[col]; + while (true) { + auto col_offsets = c.child(0).data(); + auto col_data_size = size_of(c.child(1).type()); + std::size_t alignment_needed = col_data_size; + + row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; + if (c.num_children() == 0) { + break; + } + c = c.child(1); + } + */ + }; + + uint64_t row_batch_size = 0; + uint64_t total_table_size = 0; + size_type row_batch_rows = 0; + uint64_t row_offset = 0; + + // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then + // calculate the size of each row's variable-width data and validity as well. 
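(A worked instance of the loop below, assuming three fixed-width columns whose data takes 16 bytes per row: the validity for 3 columns is one 32-bit bitmask word, so each row occupies align_offset(16 + 4, 8) = 24 bytes once padded to the 8-byte row alignment.)

    // a minimal check of that arithmetic; align8 mirrors detail::align_offset with an alignment of 8
    constexpr int align8(int v) { return (v + 7) & ~7; }
    static_assert(align8(16 + 4) == 24, "16 data bytes + one 4-byte validity word, padded to 8 bytes");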
+ auto validity_size = num_bitmask_words(num_columns) * 4; + for (int row = 0; row < num_rows; ++row) { + auto aligned_row_batch_size = + detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned + row_sizes[row] = fixed_width_size_per_row; + // validity is byte aligned + row_sizes[row] += validity_size; + // variable width data is 8-byte aligned + row_sizes[row] = detail::align_offset(row_sizes[row], 8) + + calculate_variable_width_row_data_size(row); // rows are 8 byte aligned + + if ((uint64_t)aligned_row_batch_size + row_sizes[row] > + (uint64_t)std::numeric_limits::max()) { + // a new batch starts at the last 32-row boundary + row_batches.push_back( + detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batch_size = 0; + row_batch_rows = row_batch_rows & 31; + row_offset = 0; + aligned_row_batch_size = 0; + } + row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned + row_offsets.push_back(row_offset); + row_batch_size = aligned_row_batch_size + row_sizes[row]; + row_offset += row_sizes[row]; + total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned + total_table_size += row_sizes[row]; + row_batch_rows++; + } + if (row_batch_size > 0) { + row_batches.push_back( + detail::row_batch{static_cast(row_batch_size), row_batch_rows}); + } + + auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); + + std::vector output_buffers; + std::vector output_data; + output_data.reserve(row_batches.size()); + for (uint i = 0; i < row_batches.size(); ++i) { + rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); + output_data.push_back(static_cast(temp.data())); + output_buffers.push_back(std::move(temp)); + } + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); + + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); + + // blast through the entire table and convert it + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS)); + dim3 threads(256); + + detail::copy_from_columns<<>>( + num_rows, num_columns, shmem_limit_per_block, block_infos.size(), dev_input_data.data(), + dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), dev_row_offsets.data(), + reinterpret_cast(dev_output_data.data())); + + auto validity_block_infos = + build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); + + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); + dim3 validity_blocks( + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); + detail::copy_validity_from_columns<<>>( + num_rows, num_columns, shmem_limit_per_block, dev_row_offsets.data(), dev_output_data.data(), + column_starts.back(), dev_validity_block_infos.data(), validity_block_infos.size(), + dev_input_nm.data()); + + // split up the output buffer into multiple buffers based on row batch sizes + // and create list of byte columns + int offset_offset = 0; + std::vector> ret; + for (uint i = 0; i < row_batches.size(); ++i) { + // compute offsets for this row batch + std::vector offset_vals; + offset_vals.reserve(row_batches[i].row_count + 1); + size_type cur_offset = 0; + offset_vals.push_back(cur_offset); + for (int row = 0; row < 
row_batches[i].row_count; ++row) { + cur_offset = detail::align_offset(cur_offset, 8) + row_sizes[row + offset_offset]; + offset_vals.push_back(cur_offset); + } + offset_offset += row_batches[i].row_count; + + auto dev_offsets = make_device_uvector_async(offset_vals, stream, mr); + auto offsets = std::make_unique(data_type{type_id::INT32}, + (size_type)offset_vals.size(), dev_offsets.release()); + + auto data = std::make_unique(data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, + std::move(output_buffers[i])); + + ret.push_back( + cudf::make_lists_column(row_batches[i].row_count, std::move(offsets), std::move(data), 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr)); + } + + return ret; +#else + CUDF_FAIL("Column to row conversion optimization requires volta or later hardware."); + return {}; +#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +} + +std::vector> +old_convert_to_rows(cudf::table_view const &tbl, rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { const cudf::size_type num_columns = tbl.num_columns(); std::vector schema; schema.resize(num_columns); - std::transform(tbl.begin(), tbl.end(), schema.begin(), cudf::java::get_data_type); + std::transform(tbl.begin(), tbl.end(), schema.begin(), detail::get_data_type); - if (are_all_fixed_width(schema)) { + if (detail::are_all_fixed_width(schema)) { std::vector column_start; std::vector column_size; - int32_t size_per_row = compute_fixed_width_layout(schema, column_start, column_size); - auto dev_column_start = copy_to_dev_async(column_start, stream, mr); - auto dev_column_size = copy_to_dev_async(column_size, stream, mr); + int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); + auto dev_column_start = make_device_uvector_async(column_start, stream, mr); + auto dev_column_size = make_device_uvector_async(column_size, stream, mr); int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; // Make the number of rows per batch a multiple of 32 so we don't have to worry about @@ -489,8 +1529,8 @@ std::vector> convert_to_rows(cudf::table_view cons input_data.emplace_back(cv.data()); input_nm.emplace_back(cv.null_mask()); } - auto dev_input_data = copy_to_dev_async(input_data, stream, mr); - auto dev_input_nm = copy_to_dev_async(input_nm, stream, mr); + auto dev_input_data = make_device_uvector_async(input_data, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); using ScalarType = cudf::scalar_type_t; auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); @@ -506,7 +1546,7 @@ std::vector> convert_to_rows(cudf::table_view cons for (cudf::size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { cudf::size_type row_count = num_rows - row_start; row_count = row_count > max_rows_per_batch ? 
max_rows_per_batch : row_count; - ret.emplace_back(fixed_width_convert_to_rows( + ret.emplace_back(detail::fixed_width_convert_to_rows( row_start, row_count, num_columns, size_per_row, dev_column_start, dev_column_size, dev_input_data, dev_input_nm, *zero, *step, stream, mr)); } @@ -521,7 +1561,129 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in std::vector const &schema, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + // verify that the types are what we expect + cudf::column_view child = input.child(); + cudf::type_id list_type = child.type().id(); + CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, + "Only a list of bytes is supported as input"); + + cudf::size_type num_columns = schema.size(); + cudf::size_type num_rows = input.parent().size(); + + int device_id; + CUDA_TRY(cudaGetDevice(&device_id)); + int total_shmem; + CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + + // TODO: unable to launch a kernel with all shared used + total_shmem -= 1024; + int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + + std::vector column_starts; + std::vector column_sizes; + + auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { + return std::make_tuple(schema[i], nullptr); + }); + size_type fixed_width_size_per_row = detail::compute_column_information( + iter, iter + num_columns, column_starts, column_sizes); //, [](void *) {}); + + size_type validity_size = num_bitmask_words(num_columns) * 4; + + size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); + + // Ideally we would check that the offsets are all the same, etc. 
but for now + // this is probably fine + CUDF_EXPECTS(row_size * num_rows == child.size(), "The layout of the data appears to be off"); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); + + // build the row_batches from the passed in list column + std::vector row_batches; + + row_batches.push_back(detail::row_batch{child.size(), num_rows}); + + // Allocate the columns we are going to write into + std::vector> output_columns; + std::vector output_data; + std::vector output_nm; + for (cudf::size_type i = 0; i < num_columns; i++) { + auto column = cudf::make_fixed_width_column(schema[i], num_rows, + cudf::mask_state::UNINITIALIZED, stream, mr); + auto mut = column->mutable_view(); + output_data.emplace_back(mut.data()); + output_nm.emplace_back(mut.null_mask()); + output_columns.emplace_back(std::move(column)); + } + + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); + + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); + + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); + + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); +#if defined(DEBUG) + dim3 threads(std::min(std::min(128, shmem_limit_per_block / 8), (int)child.size())); +#else + dim3 threads(std::min(256, (int)child.size())); +#endif + detail::copy_to_columns<<>>( + num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), + dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), + block_infos.size(), child.data()); + + auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); + auto const column_stride = [&]() { + if (desired_rows_and_columns > num_columns) { + // not many columns, group it into 8s and ship it off + return std::min(8, num_columns); + } else { + return util::round_down_safe(desired_rows_and_columns, 8); + } + }(); + auto const row_stride = [&]() { + // we fit as much as we can, we know the column stride now, so calculate the row + return std::min(num_rows, util::round_down_safe(shmem_limit_per_block * 8 / column_stride, 32)); + /* if (desired_rows_and_columns > num_rows) { + return std::min(32, num_rows); + } else { + return util::round_down_safe(desired_rows_and_columns, 32); + }*/ + }(); + std::vector validity_block_infos; + for (int col = 0; col < num_columns; col += column_stride) { + for (int row = 0; row < num_rows; row += row_stride) { + validity_block_infos.emplace_back( + detail::block_info{col, row, std::min(col + column_stride - 1, num_columns - 1), + std::min(row + row_stride - 1, num_rows - 1)}); + } + } + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); + dim3 validity_blocks( + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + + dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); + detail:: + copy_validity_to_columns<<>>( + num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), + dev_output_nm.data(), column_starts.back(), dev_validity_block_infos.data(), + validity_block_infos.size(), child.data()); + + return std::make_unique(std::move(output_columns)); +#else + CUDF_FAIL("Row to column conversion optimization requires volta or later hardware."); + return {}; +#endif // 
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +} +std::unique_ptr old_convert_from_rows(cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { // verify that the types are what we expect cudf::column_view child = input.child(); cudf::type_id list_type = child.type().id(); @@ -530,19 +1692,19 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in cudf::size_type num_columns = schema.size(); - if (are_all_fixed_width(schema)) { + if (detail::are_all_fixed_width(schema)) { std::vector column_start; std::vector column_size; cudf::size_type num_rows = input.parent().size(); - int32_t size_per_row = compute_fixed_width_layout(schema, column_start, column_size); + int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); // Ideally we would check that the offsets are all the same, etc. but for now // this is probably fine CUDF_EXPECTS(size_per_row * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_column_start = copy_to_dev_async(column_start, stream, mr); - auto dev_column_size = copy_to_dev_async(column_size, stream, mr); + auto dev_column_start = make_device_uvector_async(column_start, stream); + auto dev_column_size = make_device_uvector_async(column_size, stream); // Allocate the columns we are going to write into std::vector> output_columns; @@ -557,17 +1719,17 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in output_columns.emplace_back(std::move(column)); } - auto dev_output_data = copy_to_dev_async(output_data, stream, mr); - auto dev_output_nm = copy_to_dev_async(output_nm, stream, mr); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); dim3 blocks; dim3 threads; int shared_size = - calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - copy_to_fixed_width_columns<<>>( - num_rows, num_columns, size_per_row, dev_column_start->data(), dev_column_size->data(), - dev_output_data->data(), dev_output_nm->data(), child.data()); + detail::copy_to_fixed_width_columns<<>>( + num_rows, num_columns, size_per_row, dev_column_start.data(), dev_column_size.data(), + dev_output_data.data(), dev_output_nm.data(), child.data()); return std::make_unique(std::move(output_columns)); } else { @@ -575,5 +1737,4 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in } } -} // namespace java } // namespace cudf diff --git a/java/src/main/native/src/row_conversion.hpp b/java/src/main/native/src/row_conversion.hpp index 17abde8df19..517202f3892 100644 --- a/java/src/main/native/src/row_conversion.hpp +++ b/java/src/main/native/src/row_conversion.hpp @@ -25,12 +25,24 @@ namespace cudf { namespace java { +std::vector> +old_convert_to_rows(cudf::table_view const &tbl, + // TODO need something for validity + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + std::vector> convert_to_rows(cudf::table_view const &tbl, // TODO need something for validity rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); +std::unique_ptr +old_convert_from_rows(cudf::lists_column_view const &input, + std::vector const &schema, + 
rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, std::vector const &schema, rmm::cuda_stream_view stream = rmm::cuda_stream_default, From 2a57ce67cc4fa7bc7ae436756f2ab7a5d0eb2cab Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 21 Sep 2021 21:39:00 +0000 Subject: [PATCH 16/80] fixing validity alignment bugs --- cpp/src/row_conversion/row_conversion.cu | 144 +++++++++---- cpp/tests/row_conversion/row_conversion.cpp | 226 +++++++++++++++++++- java/src/main/native/src/row_conversion.cu | 22 +- 3 files changed, 333 insertions(+), 59 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 42c40e0542d..0409a65b630 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -493,7 +493,7 @@ __global__ void copy_from_columns(const size_type num_rows, input_src, col_size); - // copy the main + // copy the element to global memory cuda::memcpy_async( &shared[fetch % stages_count][shared_offset], input_src, col_size, fetch_barrier); } @@ -568,7 +568,11 @@ __global__ void copy_validity_from_columns(const size_type num_rows, int8_t* shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { shared_data, shared_data + shmem_used_per_block / 2}; - constexpr bool print_debug = false; //(threadIdx.x==0 || threadIdx.x == 32) && blockIdx.x == 0; + int8_t* output_check_addr = nullptr; + int8_t* output_block_start = nullptr; + size_type output_block_size = 0; + + bool print_debug = false; //threadIdx.x==0 && blockIdx.x == 0; // if (blockIdx.x != 3 || threadIdx.x / 32 != 0) return; if (print_debug) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -659,12 +663,14 @@ __global__ void copy_validity_from_columns(const size_type num_rows, if (print_debug) printf( - "%d %d - my warp is %d, %d total sections, %d warps per block, blockDim.x=%d, warp side " + "%d %d - my warp is %d, %d total sections(%d x, %d y), %d warps per block, blockDim.x=%d, warp size " "%d\n", threadIdx.x, blockIdx.x, warp_id, total_sections, + num_sections_x, + num_sections_y, warps_per_block, blockDim.x, detail::warp_size); @@ -672,10 +678,10 @@ __global__ void copy_validity_from_columns(const size_type num_rows, for (int my_section_idx = warp_id; my_section_idx < total_sections; my_section_idx += warps_per_block) { // convert to rows and cols - auto const section_x = my_section_idx / num_sections_x; - auto const section_y = my_section_idx % num_sections_x; + auto const section_x = my_section_idx % num_sections_x; + auto const section_y = my_section_idx / num_sections_x; - if (print_debug) printf("working on section %d of %d...\n", section_x, num_sections_x); + if (print_debug) printf("working on section %d,%d - %d of %d...\n", section_x, section_y, my_section_idx, total_sections); auto const relative_col = section_x * 32 + lane_id; auto const relative_row = section_y * 8; auto const absolute_col = relative_col + block.start_col; @@ -722,7 +728,7 @@ __global__ void copy_validity_from_columns(const size_type num_rows, absolute_col); // every thread that is participating in the warp has a byte, but it's column-based - // data and we need it in row-based. So we shiffle the bits around with ballot_sync to make + // data and we need it in row-based. So we shuffle the bits around with ballot_sync to make // the bytes we actually write. 
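              // Sketch of the bit transpose below (assuming a full warp of 32 participating lanes):
              // lane k holds one validity bit per row for its own column; on iteration i,
              // __ballot_sync gathers bit i from every lane into a single 32-bit word whose bit k
              // comes from lane k, i.e. the row-major validity word for row (relative_row + i)
              // that is then written out to shared memory.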
for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); @@ -744,23 +750,23 @@ __global__ void copy_validity_from_columns(const size_type num_rows, if (cols_left <= 8) { // write byte if (print_debug) - printf("writing single byte to shared offset 0x%x which is %p...\n", - validity_write_offset, + printf("%d %d - writing single byte to shared offset 0x%x which is %p...\n", + threadIdx.x, blockIdx.x, validity_write_offset, &this_shared_block[validity_write_offset]); this_shared_block[validity_write_offset] = validity_data & 0xFF; } else if (cols_left <= 16) { // write int16 if (print_debug) - printf("writing two bytes to shared offset 0x%x which is %p...\n", - validity_write_offset, + printf("%d %d - writing two bytes to shared offset 0x%x which is %p...\n", + threadIdx.x, blockIdx.x, validity_write_offset, &this_shared_block[validity_write_offset]); *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data & 0xFFFF; } else if (cols_left <= 24) { // write int16 and then int8 if (print_debug) - printf("writing three bytes to shared offset 0x%x which is %p...\n", - validity_write_offset, + printf("%d %d - writing three bytes to shared offset 0x%x which is %p...\n", + threadIdx.x, blockIdx.x, validity_write_offset, &this_shared_block[validity_write_offset]); *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data & 0xFFFF; @@ -768,8 +774,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, } else { // write int32 if (print_debug) - printf("writing 4 bytes to shared offset 0x%x which is %p...\n", - validity_write_offset, + printf("%d %d - writing 4 bytes to shared offset 0x%x which is %p...\n", + threadIdx.x, blockIdx.x, validity_write_offset, &this_shared_block[validity_write_offset]); *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data; @@ -816,6 +822,18 @@ __global__ void copy_validity_from_columns(const size_type num_rows, auto const output_ptr = output_data[block.buffer_num] + row_offsets[row] + validity_offset + block.start_col / 8; auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); + +/* if (num_rows >= 5006) { + auto const row5006_col_65 = output_data[block.buffer_num] + row_offsets[5006] + validity_offset + 65 / 8; + if (output_ptr >= row5006_col_65 && output_ptr <= row5006_col_65 + 4) { + printf("%d %d - writing bytes from %p(0x%x)-%p to %p-%p that overlap global %p(0x%x), which is row 5006, col 65!\n", threadIdx.x, blockIdx.x, &this_shared_block[validity_data_row_length * relative_row], this_shared_block[validity_data_row_length * relative_row], &this_shared_block[validity_data_row_length * relative_row + num_bytes], output_ptr, output_ptr + num_bytes, row5006_col_65, *row5006_col_65); + printf("%d %d - block information\n%d,%d -> %d,%d\n%d columns, %d rows\n", threadIdx.x, blockIdx.x, block.start_col, block.start_row, block.end_col, block.end_row, block.num_cols(), block.num_rows()); + output_check_addr = row5006_col_65; + output_block_start = output_ptr; + output_block_size = num_bytes; + } + }*/ + cuda::memcpy_async( output_ptr, &this_shared_block[validity_data_row_length * relative_row], @@ -851,6 +869,17 @@ __global__ void copy_validity_from_columns(const size_type num_rows, ++validity_block) { shared_block_barriers[validity_block].arrive_and_wait(); } + if (output_check_addr != nullptr) { + printf("output check after write to %p - 0x%x\n", output_check_addr, 
*output_check_addr); + for (int i=0; i get_admin_data_sizes(size_t col_size_size, @@ -901,12 +930,12 @@ static __device__ void fetch_blocks_for_row_to_column( for (; fetch_index < static_cast(total_blocks) && fetch_index < (processing_index + read_ahead_count); ++fetch_index) { - if (debug_print) - printf("fetching block %lu of %d\n", - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index, - total_blocks); auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index]; + if (debug_print) + printf("fetching block %lu of %d for start col %d, end col %d. Starting col offset is %p, ending offset %p\n", + blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index, + total_blocks, fetch_block.start_col, fetch_block.end_col, &col_offsets[fetch_block.start_col], &col_offsets[fetch_block.end_col]); auto const fetch_block_start_row = fetch_block.start_row; auto const fetch_block_end_row = fetch_block.end_row; auto const starting_col_offset = col_offsets[fetch_block.start_col]; @@ -948,7 +977,7 @@ static __device__ void fetch_blocks_for_row_to_column( &shared[fetch_index % max_resident_blocks][shared_row_offset], &col_offsets[fetch_block.start_col], col_offset_bytes); - cuda::memcpy_async(group, + cuda::memcpy_async(group, &shared[fetch_index % max_resident_blocks][shared_row_offset], &col_offsets[fetch_block.start_col], col_offset_bytes, @@ -983,7 +1012,7 @@ static __device__ void fetch_blocks_for_row_to_column( fetch_index % max_resident_blocks, &shared[fetch_index % max_resident_blocks][shared_offset], &input_data[row_offsets[row] + starting_col_offset]); - // copy the main + // copy the main cuda::memcpy_async(&shared[fetch_index % max_resident_blocks][shared_offset], &input_data[row_offsets[row] + starting_col_offset], fetch_block_row_size, @@ -1029,7 +1058,7 @@ __global__ void copy_to_columns(const size_type num_rows, // to speed up some of the random access memory we do, we copy col_sizes and col_offsets // to shared memory for each of the blocks that we work on - /*constexpr*/ bool debug_print = false; // threadIdx.x == 0; + /*constexpr*/ bool debug_print = false; //threadIdx.x == 0 && blockIdx.x == 0; constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; auto group = cooperative_groups::this_thread_block(); extern __shared__ int8_t shared_data[]; @@ -1037,12 +1066,14 @@ __global__ void copy_to_columns(const size_type num_rows, if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); + printf("%d block infos are at %p and my index is %d\n", num_block_infos, block_infos, blockIdx.x); /* printf("Row Offsets:\n"); for (int i=0; i build_validity_block_infos( }(), 8); // we fit as much as we can given the column stride - auto const row_stride = std::min(num_rows, shmem_limit_per_block * 8 / column_stride); + // note that an element in the table takes just 1 bit, but a row with a single + // element still takes 8 bytes! 
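  // Illustrative numbers only: with column_stride = 64, a row needs ceil(64 / 8) = 8 bytes of
  // validity, already 8-byte aligned, so a hypothetical 24 KiB shared-memory budget would allow
  // up to 24 * 1024 / 8 = 3072 rows per block before the std::min with num_rows below.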
+ auto const bytes_per_row = align_offset(util::div_rounding_up_unsafe(column_stride, 8), 8); + auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); std::vector validity_block_infos; for (int col = 0; col < num_columns; col += column_stride) { @@ -1695,6 +1747,7 @@ std::vector build_block_infos(std::vector const& column_s } int const window_height = std::min(desired_window_height, rows_left_in_batch); +// printf("block %d, %d to %d, %d\n", start_col, current_window_start_row, end_col, std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1)); block_infos.emplace_back(detail::block_info{ start_col, current_window_start_row, @@ -1716,11 +1769,7 @@ std::vector build_block_infos(std::vector const& column_s // bytes, not rows or columns. size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); int const window_height = - std::clamp(util::round_up_safe( - optimal_square_len <= (size_type)column_sizes.size() - ? std::min(optimal_square_len / column_sizes[0], total_number_of_rows) - : row_batches[0].row_count / 2, - 32), + std::clamp(util::round_up_safe(std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], total_number_of_rows), 32), 1, row_batches[0].row_count); #if defined(DEBUG) @@ -1787,7 +1836,7 @@ std::vector build_block_infos(std::vector const& column_s shmem_limit_per_block); #endif // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col - 1, window_height); + build_blocks(current_window_start_col, col == 0 ? col : col - 1, window_height); row_size = detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); #if defined(DEBUG) @@ -1973,6 +2022,16 @@ std::vector> convert_to_rows(cudf::table_view cons } c = c.child(1); } + exclusive_scan([t](int row_index) { + size_type total_row_size = 0; + for (int i=0 i> convert_to_rows(cudf::table_view cons // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. auto validity_size = num_bitmask_words(num_columns) * 4; + // thrust for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned @@ -2310,8 +2370,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const& in auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); auto const column_stride = [&]() { if (desired_rows_and_columns > num_columns) { - // not many columns, group it into 8s and ship it off - return std::min(8, num_columns); + // not many columns, group it into 64s and ship it off + return std::min(64, num_columns); } else { return util::round_down_safe(desired_rows_and_columns, 8); } @@ -2325,6 +2385,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const& in return util::round_down_safe(desired_rows_and_columns, 32); }*/ }(); + printf("column stride is %d and row stride is %d. 
std::min(%d, util::round_down_safe(%d * 8 / %d, 32))\n", column_stride, row_stride, num_rows, shmem_limit_per_block, column_stride); + printf("each block uses %d bytes of shared memory\n", (column_stride / 8) * detail::align_offset(row_stride, 4)); std::vector validity_block_infos; for (int col = 0; col < num_columns; col += column_stride) { for (int row = 0; row < num_rows; row += row_stride) { diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index e38b37e81a6..26e071eef79 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -33,11 +33,19 @@ TEST_F(ColumnToRowTests, Single) { cudf::test::fixed_width_column_wrapper a({-1}); cudf::table_view in(std::vector{a}); + std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; auto old_rows = cudf::old_convert_to_rows(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } + for (uint i = 0; i < old_rows.size(); i++) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } @@ -47,11 +55,19 @@ TEST_F(ColumnToRowTests, Simple) { cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); cudf::table_view in(std::vector{a}); + std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; auto old_rows = cudf::old_convert_to_rows(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } + for (uint i = 0; i < old_rows.size(); i++) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } @@ -63,11 +79,20 @@ TEST_F(ColumnToRowTests, Tall) cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); cudf::table_view in(std::vector{a}); + std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; auto old_rows = cudf::old_convert_to_rows(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } + for (uint i = 0; i < old_rows.size(); i++) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } @@ -77,10 +102,12 @@ TEST_F(ColumnToRowTests, Wide) { std::vector> cols; std::vector views; + std::vector schema; for (int i = 0; i < 256; ++i) { cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); } cudf::table_view in(views); @@ -88,6 +115,13 @@ TEST_F(ColumnToRowTests, Wide) auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = 
cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } + for (uint i = 0; i < old_rows.size(); i++) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } @@ -97,10 +131,13 @@ TEST_F(ColumnToRowTests, SingleByteWide) { std::vector> cols; std::vector views; + std::vector schema; for (int i = 0; i < 256; ++i) { cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); views.push_back(cols.back()); + + schema.push_back(cudf::data_type{cudf::type_id::INT8}); } cudf::table_view in(views); @@ -108,6 +145,59 @@ TEST_F(ColumnToRowTests, SingleByteWide) auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } + + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, Non2Power) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + constexpr auto num_rows = 6 * 1024 + 557; + for (int i = 0; i < 131; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + for (int j=0; jnum_columns(); ++j) { + printf("testing column %d\n", j); + if (j==65) { + printf("old\n"); + cudf::test::print(old_tbl->get_column(j)); + printf("new\n"); + cudf::test::print(new_tbl->get_column(j)); + } + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); + } + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } + for (uint i = 0; i < old_rows.size(); i++) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } @@ -119,11 +209,69 @@ TEST_F(ColumnToRowTests, Big) cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); std::vector> cols; std::vector views; + std::vector schema; - for (int i = 0; i < 256; ++i) { + // 28 columns of 1 million rows + constexpr auto num_rows = 1024 * 1024; + for (int i = 0; i < 28; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, Bigger) +{ + auto r = + 
cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + // 128 columns of 1 million rows + constexpr auto num_rows = 1024 * 1024; + for (int i = 0; i < 128; ++i) { cols.push_back( - cudf::test::fixed_width_column_wrapper(r + 4096 * i, r + 4096 * i + 4096)); + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, Biggest) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + // 128 columns of 2 million rows + constexpr auto num_rows = 2 * 1024 * 1024; + for (int i = 0; i < 128; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); } cudf::table_view in(views); @@ -238,7 +386,7 @@ TEST_F(RowToColumnTests, SingleByteWide) } } -TEST_F(RowToColumnTests, non2power) +TEST_F(RowToColumnTests, Non2Power) { auto r = cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); @@ -246,9 +394,13 @@ TEST_F(RowToColumnTests, non2power) std::vector views; std::vector schema; - cols.push_back(cudf::test::fixed_width_column_wrapper(r, r + 13)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); + constexpr auto num_rows = 6 * 1024 + 557; + for (int i = 0; i < 131; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } cudf::table_view in(views); auto old_rows = cudf::old_convert_to_rows(in); @@ -269,9 +421,67 @@ TEST_F(RowToColumnTests, Big) std::vector views; std::vector schema; - for (int i = 0; i < 256; ++i) { + // 28 columns of 1 million rows + constexpr auto num_rows = 1024 * 1024; + for (int i = 0; i < 28; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, Bigger) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + // 28 columns of 1 million rows + constexpr auto num_rows = 1024 * 1024; + for (int i = 0; i < 128; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + views.push_back(cols.back()); + 
schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, Biggest) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + // 28 columns of 1 million rows + constexpr auto num_rows = 5 * 1024 * 1024; + for (int i = 0; i < 128; ++i) { cols.push_back( - cudf::test::fixed_width_column_wrapper(r + 4096 * i, r + 4096 * i + 4096)); + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 1babbc6fd1a..9f0df3569a7 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -54,7 +54,9 @@ constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; #endif using cudf::detail::make_device_uvector_async; -namespace cudf { +using cudf::detail::warp_size; + +namespace cudf::java { namespace detail { @@ -526,9 +528,9 @@ __global__ void copy_validity_from_columns( align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); auto const total_sections = num_sections_x * num_sections_y; - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + int const warp_id = threadIdx.x / warp_size; + int const lane_id = threadIdx.x % warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / warp_size); // the block is divided into sections. A warp operates on a section at a time. for (int my_section_idx = warp_id; my_section_idx < total_sections; @@ -557,7 +559,7 @@ __global__ void copy_validity_from_columns( // lead thread in each warp writes data auto const validity_write_offset = validity_data_row_length * (relative_row + i) + relative_col / 8; - if (threadIdx.x % detail::warp_size == 0) { + if (threadIdx.x % warp_size == 0) { if (cols_left <= 8) { // write byte this_shared_block[validity_write_offset] = validity_data & 0xFF; @@ -855,12 +857,12 @@ __global__ void copy_validity_to_columns( auto const num_sections_x = (num_block_cols + 7) / 8; auto const num_sections_y = (num_block_rows + 31) / 32; - auto const validity_data_col_length = align_offset(num_sections_y, 4); + auto const validity_data_col_length = num_sections_y * 4; // words to bytes auto const total_sections = num_sections_x * num_sections_y; - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + int const warp_id = threadIdx.x / warp_size; + int const lane_id = threadIdx.x % warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / warp_size); // the block is divided into sections. A warp operates on a section at a time. 
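    // Illustrative example: a block spanning 20 columns and 100 rows splits into
    // ceil(20 / 8) = 3 column sections and ceil(100 / 32) = 4 row sections, 12 sections total;
    // warp w then handles sections w, w + warps_per_block, w + 2 * warps_per_block, ...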
for (int my_section_idx = warp_id; my_section_idx < total_sections; @@ -888,7 +890,7 @@ __global__ void copy_validity_to_columns( ++i, byte_mask <<= 1) { auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); // lead thread in each warp writes data - if (threadIdx.x % detail::warp_size == 0) { + if (threadIdx.x % warp_size == 0) { auto const validity_write_offset = validity_data_col_length * (relative_col + i) + relative_row / 8; From ed5492eb80979d6f90ab18f64d6baf5006abf6a6 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 22 Sep 2021 03:11:58 +0000 Subject: [PATCH 17/80] Updates and bug fixes --- .../row_conversion/row_conversion.cpp | 2 +- cpp/src/row_conversion/row_conversion.cu | 206 +++++++----------- cpp/tests/row_conversion/row_conversion.cpp | 36 +-- java/src/main/native/src/row_conversion.cu | 106 ++++----- 4 files changed, 155 insertions(+), 195 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index ad9925e9043..2fe436a22c1 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -19,8 +19,8 @@ #include #include -#include #include +#include #include class RowConversion : public cudf::benchmark { diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 0409a65b630..eb3c4b28b6a 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -568,11 +568,7 @@ __global__ void copy_validity_from_columns(const size_type num_rows, int8_t* shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { shared_data, shared_data + shmem_used_per_block / 2}; - int8_t* output_check_addr = nullptr; - int8_t* output_block_start = nullptr; - size_type output_block_size = 0; - - bool print_debug = false; //threadIdx.x==0 && blockIdx.x == 0; + constexpr bool print_debug = false; // threadIdx.x==0 && blockIdx.x == 0; // if (blockIdx.x != 3 || threadIdx.x / 32 != 0) return; if (print_debug) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -663,7 +659,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, if (print_debug) printf( - "%d %d - my warp is %d, %d total sections(%d x, %d y), %d warps per block, blockDim.x=%d, warp size " + "%d %d - my warp is %d, %d total sections(%d x, %d y), %d warps per block, blockDim.x=%d, " + "warp size " "%d\n", threadIdx.x, blockIdx.x, @@ -681,7 +678,12 @@ __global__ void copy_validity_from_columns(const size_type num_rows, auto const section_x = my_section_idx % num_sections_x; auto const section_y = my_section_idx / num_sections_x; - if (print_debug) printf("working on section %d,%d - %d of %d...\n", section_x, section_y, my_section_idx, total_sections); + if (print_debug) + printf("working on section %d,%d - %d of %d...\n", + section_x, + section_y, + my_section_idx, + total_sections); auto const relative_col = section_x * 32 + lane_id; auto const relative_row = section_y * 8; auto const absolute_col = relative_col + block.start_col; @@ -751,14 +753,18 @@ __global__ void copy_validity_from_columns(const size_type num_rows, // write byte if (print_debug) printf("%d %d - writing single byte to shared offset 0x%x which is %p...\n", - threadIdx.x, blockIdx.x, validity_write_offset, + threadIdx.x, + blockIdx.x, + validity_write_offset, &this_shared_block[validity_write_offset]); this_shared_block[validity_write_offset] = validity_data & 0xFF; } else if 
(cols_left <= 16) { // write int16 if (print_debug) printf("%d %d - writing two bytes to shared offset 0x%x which is %p...\n", - threadIdx.x, blockIdx.x, validity_write_offset, + threadIdx.x, + blockIdx.x, + validity_write_offset, &this_shared_block[validity_write_offset]); *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data & 0xFFFF; @@ -766,7 +772,9 @@ __global__ void copy_validity_from_columns(const size_type num_rows, // write int16 and then int8 if (print_debug) printf("%d %d - writing three bytes to shared offset 0x%x which is %p...\n", - threadIdx.x, blockIdx.x, validity_write_offset, + threadIdx.x, + blockIdx.x, + validity_write_offset, &this_shared_block[validity_write_offset]); *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data & 0xFFFF; @@ -775,7 +783,9 @@ __global__ void copy_validity_from_columns(const size_type num_rows, // write int32 if (print_debug) printf("%d %d - writing 4 bytes to shared offset 0x%x which is %p...\n", - threadIdx.x, blockIdx.x, validity_write_offset, + threadIdx.x, + blockIdx.x, + validity_write_offset, &this_shared_block[validity_write_offset]); *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data; @@ -823,63 +833,20 @@ __global__ void copy_validity_from_columns(const size_type num_rows, output_data[block.buffer_num] + row_offsets[row] + validity_offset + block.start_col / 8; auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); -/* if (num_rows >= 5006) { - auto const row5006_col_65 = output_data[block.buffer_num] + row_offsets[5006] + validity_offset + 65 / 8; - if (output_ptr >= row5006_col_65 && output_ptr <= row5006_col_65 + 4) { - printf("%d %d - writing bytes from %p(0x%x)-%p to %p-%p that overlap global %p(0x%x), which is row 5006, col 65!\n", threadIdx.x, blockIdx.x, &this_shared_block[validity_data_row_length * relative_row], this_shared_block[validity_data_row_length * relative_row], &this_shared_block[validity_data_row_length * relative_row + num_bytes], output_ptr, output_ptr + num_bytes, row5006_col_65, *row5006_col_65); - printf("%d %d - block information\n%d,%d -> %d,%d\n%d columns, %d rows\n", threadIdx.x, blockIdx.x, block.start_col, block.start_row, block.end_col, block.end_row, block.num_cols(), block.num_rows()); - output_check_addr = row5006_col_65; - output_block_start = output_ptr; - output_block_size = num_bytes; - } - }*/ - cuda::memcpy_async( output_ptr, &this_shared_block[validity_data_row_length * relative_row], num_bytes, shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); - - /* auto const padding_ptr = output_ptr + num_bytes; - auto const padding_needed = -reinterpret_cast(padding_ptr) & 7; - if (print_debug) printf( - "absolute_row: %d, row_offset for this row: 0x%x, validity data bytes: %d, end - address: %p, padding bytes %lu\n", row, row_offsets[row], num_bytes, output_ptr + - num_bytes, padding_needed); cuda::memcpy_async(padding_ptr, zero, padding_needed, - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); - */ - - /* if (print_debug) { - for (int i=0; i get_admin_data_sizes(size_t col_size_size, @@ -932,10 +899,16 @@ static __device__ void fetch_blocks_for_row_to_column( ++fetch_index) { auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index]; - if (debug_print) - printf("fetching block %lu of %d for start col %d, end col %d. 
Starting col offset is %p, ending offset %p\n", - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index, - total_blocks, fetch_block.start_col, fetch_block.end_col, &col_offsets[fetch_block.start_col], &col_offsets[fetch_block.end_col]); + if (debug_print) + printf( + "fetching block %lu of %d for start col %d, end col %d. Starting col offset is %p, ending " + "offset %p\n", + blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index, + total_blocks, + fetch_block.start_col, + fetch_block.end_col, + &col_offsets[fetch_block.start_col], + &col_offsets[fetch_block.end_col]); auto const fetch_block_start_row = fetch_block.start_row; auto const fetch_block_end_row = fetch_block.end_row; auto const starting_col_offset = col_offsets[fetch_block.start_col]; @@ -977,7 +950,7 @@ static __device__ void fetch_blocks_for_row_to_column( &shared[fetch_index % max_resident_blocks][shared_row_offset], &col_offsets[fetch_block.start_col], col_offset_bytes); - cuda::memcpy_async(group, + cuda::memcpy_async(group, &shared[fetch_index % max_resident_blocks][shared_row_offset], &col_offsets[fetch_block.start_col], col_offset_bytes, @@ -985,23 +958,6 @@ static __device__ void fetch_blocks_for_row_to_column( shared_row_offset += col_offset_bytes; shared_row_offset = align_offset(shared_row_offset, 8); - if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0 && fetch_block.start_col == 0 && - fetch_block.start_row <= 51 && fetch_block.end_row >= 51) { - printf("Input data for col 0 row 51 is 0x"); - for (int i = 0; i < col_sizes[0]; ++i) { - printf("%x ", input_data[row_offsets[51] + col_offsets[0] + i]); - } - printf("\n"); - printf( - "this is at offset %d-%d and starting column offset is %d and we're reading %d bytes\n", - col_offsets[0], - col_offsets[0] + col_sizes[0], - starting_col_offset, - fetch_block_row_size); - auto shared_offset = (51 - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; - printf("destination is %p", &shared[fetch_index % max_resident_blocks][shared_offset]); - } - for (auto row = fetch_block_start_row + static_cast(threadIdx.x); row <= fetch_block_end_row; row += blockDim.x) { @@ -1012,7 +968,7 @@ static __device__ void fetch_blocks_for_row_to_column( fetch_index % max_resident_blocks, &shared[fetch_index % max_resident_blocks][shared_offset], &input_data[row_offsets[row] + starting_col_offset]); - // copy the main + // copy the main cuda::memcpy_async(&shared[fetch_index % max_resident_blocks][shared_offset], &input_data[row_offsets[row] + starting_col_offset], fetch_block_row_size, @@ -1058,7 +1014,7 @@ __global__ void copy_to_columns(const size_type num_rows, // to speed up some of the random access memory we do, we copy col_sizes and col_offsets // to shared memory for each of the blocks that we work on - /*constexpr*/ bool debug_print = false; //threadIdx.x == 0 && blockIdx.x == 0; + /*constexpr*/ bool debug_print = false; // threadIdx.x == 0 && blockIdx.x == 0; constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; auto group = cooperative_groups::this_thread_block(); extern __shared__ int8_t shared_data[]; @@ -1066,14 +1022,17 @@ __global__ void copy_to_columns(const size_type num_rows, if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("%d block infos are at %p and my index is %d\n", num_block_infos, block_infos, blockIdx.x); + printf( + "%d block infos are at %p and my index is %d\n", num_block_infos, block_infos, blockIdx.x); /* printf("Row Offsets:\n"); for (int i=0; 
i NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED ? NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED @@ -1696,7 +1650,7 @@ std::vector build_validity_block_infos( // note that an element in the table takes just 1 bit, but a row with a single // element still takes 8 bytes! auto const bytes_per_row = align_offset(util::div_rounding_up_unsafe(column_stride, 8), 8); - auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); + auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); std::vector validity_block_infos; for (int col = 0; col < num_columns; col += column_stride) { @@ -1747,7 +1701,6 @@ std::vector build_block_infos(std::vector const& column_s } int const window_height = std::min(desired_window_height, rows_left_in_batch); -// printf("block %d, %d to %d, %d\n", start_col, current_window_start_row, end_col, std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1)); block_infos.emplace_back(detail::block_info{ start_col, current_window_start_row, @@ -1768,10 +1721,13 @@ std::vector build_block_infos(std::vector const& column_s // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in // bytes, not rows or columns. size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); - int const window_height = - std::clamp(util::round_up_safe(std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], total_number_of_rows), 32), - 1, - row_batches[0].row_count); + int const window_height = std::clamp( + util::round_up_safe( + std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], + total_number_of_rows), + 32), + 1, + row_batches[0].row_count); #if defined(DEBUG) printf( "optimal_square_len is %d and we have %d columns, optimal_square_len / column_sizes[0] is %d " @@ -2385,8 +2341,6 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const& in return util::round_down_safe(desired_rows_and_columns, 32); }*/ }(); - printf("column stride is %d and row stride is %d. 
std::min(%d, util::round_down_safe(%d * 8 / %d, 32))\n", column_stride, row_stride, num_rows, shmem_limit_per_block, column_stride); - printf("each block uses %d bytes of shared memory\n", (column_stride / 8) * detail::align_offset(row_stride, 4)); std::vector validity_block_infos; for (int col = 0; col < num_columns; col += column_stride) { for (int row = 0; row < num_rows; row += row_stride) { diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index 26e071eef79..70a4552a6f9 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -168,8 +168,8 @@ TEST_F(ColumnToRowTests, Non2Power) constexpr auto num_rows = 6 * 1024 + 557; for (int i = 0; i < 131; ++i) { - cols.push_back( - cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } @@ -184,9 +184,9 @@ TEST_F(ColumnToRowTests, Non2Power) auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - for (int j=0; jnum_columns(); ++j) { + for (int j = 0; j < old_tbl->num_columns(); ++j) { printf("testing column %d\n", j); - if (j==65) { + if (j == 65) { printf("old\n"); cudf::test::print(old_tbl->get_column(j)); printf("new\n"); @@ -214,8 +214,8 @@ TEST_F(ColumnToRowTests, Big) // 28 columns of 1 million rows constexpr auto num_rows = 1024 * 1024; for (int i = 0; i < 28; ++i) { - cols.push_back( - cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } @@ -241,8 +241,8 @@ TEST_F(ColumnToRowTests, Bigger) // 128 columns of 1 million rows constexpr auto num_rows = 1024 * 1024; for (int i = 0; i < 128; ++i) { - cols.push_back( - cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } @@ -268,8 +268,8 @@ TEST_F(ColumnToRowTests, Biggest) // 128 columns of 2 million rows constexpr auto num_rows = 2 * 1024 * 1024; for (int i = 0; i < 128; ++i) { - cols.push_back( - cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } @@ -396,8 +396,8 @@ TEST_F(RowToColumnTests, Non2Power) constexpr auto num_rows = 6 * 1024 + 557; for (int i = 0; i < 131; ++i) { - cols.push_back( - cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } @@ -424,8 +424,8 @@ TEST_F(RowToColumnTests, Big) // 28 columns of 1 million rows constexpr auto num_rows = 1024 * 1024; for (int i = 0; i < 28; ++i) { - cols.push_back( - 
cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } @@ -452,8 +452,8 @@ TEST_F(RowToColumnTests, Bigger) // 28 columns of 1 million rows constexpr auto num_rows = 1024 * 1024; for (int i = 0; i < 128; ++i) { - cols.push_back( - cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } @@ -480,8 +480,8 @@ TEST_F(RowToColumnTests, Biggest) // 28 columns of 1 million rows constexpr auto num_rows = 5 * 1024 * 1024; for (int i = 0; i < 128; ++i) { - cols.push_back( - cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 9f0df3569a7..c64a61b3373 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -54,9 +54,7 @@ constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; #endif using cudf::detail::make_device_uvector_async; -using cudf::detail::warp_size; - -namespace cudf::java { +namespace cudf { namespace detail { @@ -403,7 +401,6 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ // Fetch ahead up to stages_count subsets for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch]; - auto const num_fetch_cols = fetch_block.num_cols(); auto const num_fetch_rows = fetch_block.num_rows(); auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; @@ -435,7 +432,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto const shared_offset = relative_row * fetch_block_row_size + relative_col_offset; auto const input_src = input_data[absolute_col] + col_size * absolute_row; - // copy the main + // copy the element to global memory cuda::memcpy_async(&shared[fetch % stages_count][shared_offset], input_src, col_size, fetch_barrier); } @@ -445,18 +442,19 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ subset_barrier.arrive_and_wait(); auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; - /* auto const rows_in_block = block.num_rows(); - auto const cols_in_block = block.num_cols();*/ + auto const block_row_size = block.get_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; // copy entire rows to final dest for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; absolute_row += blockDim.x) { + auto const relative_row = absolute_row - block.start_row; auto const output_dest = output_data[block.buffer_num] + absolute_row * block_row_size + column_offset; auto const shared_offset = block_row_size * relative_row; + cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], block_row_size, subset_barrier); } @@ -528,23 +526,22 @@ __global__ 
void copy_validity_from_columns( align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); auto const total_sections = num_sections_x * num_sections_y; - int const warp_id = threadIdx.x / warp_size; - int const lane_id = threadIdx.x % warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / warp_size); + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); // the block is divided into sections. A warp operates on a section at a time. for (int my_section_idx = warp_id; my_section_idx < total_sections; my_section_idx += warps_per_block) { - // convert to rows and cols - auto const section_x = my_section_idx / num_sections_x; - auto const section_y = my_section_idx % num_sections_x; + // convert to rows and cols + auto const section_x = my_section_idx % num_sections_x; + auto const section_y = my_section_idx / num_sections_x; auto const relative_col = section_x * 32 + lane_id; auto const relative_row = section_y * 8; auto const absolute_col = relative_col + block.start_col; auto const absolute_row = relative_row + block.start_row; auto const cols_left = num_columns - absolute_col; - auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); if (absolute_col < num_columns) { @@ -552,14 +549,14 @@ __global__ void copy_validity_from_columns( input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] : 0xFF; // every thread that is participating in the warp has a byte, but it's column-based - // data and we need it in row-based. So we shiffle the bits around with ballot_sync to make + // data and we need it in row-based. So we shuffle the bits around with ballot_sync to make // the bytes we actually write. 
for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); // lead thread in each warp writes data auto const validity_write_offset = validity_data_row_length * (relative_row + i) + relative_col / 8; - if (threadIdx.x % warp_size == 0) { + if (threadIdx.x % detail::warp_size == 0) { if (cols_left <= 8) { // write byte this_shared_block[validity_write_offset] = validity_data & 0xFF; @@ -591,6 +588,7 @@ __global__ void copy_validity_from_columns( auto const output_ptr = output_data[block.buffer_num] + row_offsets[row] + validity_offset + block.start_col / 8; auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); + cuda::memcpy_async( output_ptr, &this_shared_block[validity_data_row_length * relative_row], num_bytes, shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); @@ -647,7 +645,6 @@ fetch_blocks_for_row_to_column(size_t &fetch_index, size_t const processing_inde auto const fetch_block_start_row = fetch_block.start_row; auto const fetch_block_end_row = fetch_block.end_row; auto const starting_col_offset = col_offsets[fetch_block.start_col]; - auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); auto const num_fetch_cols = fetch_block.num_cols(); auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( @@ -718,9 +715,9 @@ __global__ void copy_to_columns(const size_type num_rows, const size_type num_co extern __shared__ int8_t shared_data[]; int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; - __shared__ cuda::barrier block_barrier[stages_count]; + __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; if (group.thread_rank() == 0) { - for (int i = 0; i < stages_count; ++i) { + for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { init(&block_barrier[i], group.size()); } } @@ -748,12 +745,11 @@ __global__ void copy_to_columns(const size_type num_rows, const size_type num_co block_infos, _col_sizes, _col_offsets, row_offsets, input_data, shared, group, block_barrier); - auto &subset_barrier = block_barrier[subset % stages_count]; + auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; // ensure our data is ready subset_barrier.arrive_and_wait(); - auto const block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; - + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; auto const rows_in_block = block.num_rows(); auto const cols_in_block = block.num_cols(); @@ -851,18 +847,15 @@ __global__ void copy_validity_to_columns( auto const block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; auto const block_start_col = block.start_col; auto const block_start_row = block.start_row; - auto const num_block_cols = block.num_cols(); auto const num_block_rows = block.num_rows(); - auto const num_sections_x = (num_block_cols + 7) / 8; auto const num_sections_y = (num_block_rows + 31) / 32; auto const validity_data_col_length = num_sections_y * 4; // words to bytes auto const total_sections = num_sections_x * num_sections_y; - - int const warp_id = threadIdx.x / warp_size; - int const lane_id = threadIdx.x % warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / warp_size); + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / 
detail::warp_size); // the block is divided into sections. A warp operates on a section at a time. for (int my_section_idx = warp_id; my_section_idx < total_sections; @@ -870,7 +863,6 @@ __global__ void copy_validity_to_columns( // convert to rows and cols auto const section_x = my_section_idx % num_sections_x; auto const section_y = my_section_idx / num_sections_x; - auto const relative_col = section_x * 8; auto const relative_row = section_y * 32 + lane_id; auto const absolute_col = relative_col + block_start_col; @@ -890,9 +882,11 @@ __global__ void copy_validity_to_columns( ++i, byte_mask <<= 1) { auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); // lead thread in each warp writes data - if (threadIdx.x % warp_size == 0) { + if (threadIdx.x % detail::warp_size == 0) { auto const validity_write_offset = validity_data_col_length * (relative_col + i) + relative_row / 8; + auto const write_5006_offset = 837; // validity_data_col_length * (65 - block_start_col) + // + (5006 - block_start_row)/8; if (rows_left <= 8) { // write byte @@ -922,6 +916,8 @@ __global__ void copy_validity_to_columns( // now async memcpy the shared for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { auto const relative_col = col - block.start_col; + auto const words_to_copy = util::div_rounding_up_unsafe(num_block_rows, 32); + auto const starting_address = output_nm[col] + word_index(block_start_row); cuda::memcpy_async( output_nm[col] + word_index(block_start_row), @@ -965,8 +961,9 @@ static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, // level when writing validity data out to main memory, and that would // need to change if we split a word of validity data between blocks. int y_block_size = (num_columns + 3) / 4; // cudf::util::div_rounding_up_safe(num_columns, 4); - if (y_block_size > 32) + if (y_block_size > 32) { y_block_size = 32; + } int x_possible_block_size = 1024 / y_block_size; // 48KB is the default setting for shared memory per block according to the cuda tutorials // If someone configures the GPU to only have 16 KB this might not work. @@ -1135,7 +1132,10 @@ build_validity_block_infos(size_type const &num_columns, size_type const &num_ro }(), 8); // we fit as much as we can given the column stride - auto const row_stride = std::min(num_rows, shmem_limit_per_block * 8 / column_stride); + // note that an element in the table takes just 1 bit, but a row with a single + // element still takes 8 bytes! + auto const bytes_per_row = align_offset(util::div_rounding_up_unsafe(column_stride, 8), 8); + auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); std::vector validity_block_infos; for (int col = 0; col < num_columns; col += column_stride) { @@ -1203,13 +1203,12 @@ std::vector build_block_infos(std::vector const &column_s // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in // bytes, not rows or columns. size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); - int const window_height = - std::clamp(util::round_up_safe( - optimal_square_len <= (size_type)column_sizes.size() ? 
- std::min(optimal_square_len / column_sizes[0], total_number_of_rows) : - row_batches[0].row_count / 2, - 32), - 1, row_batches[0].row_count); + int const window_height = std::clamp( + util::round_up_safe( + std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], + total_number_of_rows), + 32), + 1, row_batches[0].row_count); auto calc_admin_data_size = [](int num_cols) -> size_type { // admin data is the column sizes and column start information. @@ -1233,8 +1232,9 @@ std::vector build_block_infos(std::vector const &column_s if (row_size_with_end_pad * window_height + calc_admin_data_size(col - current_window_start_col) > shmem_limit_per_block) { + // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col - 1, window_height); + build_blocks(current_window_start_col, col == 0 ? col : col - 1, window_height); row_size = detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); row_size += col_size; // alignment required for shared memory window boundary to match @@ -1274,9 +1274,8 @@ std::vector> convert_to_rows(cudf::table_view cons int total_shmem; CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - // TODO: kernels fail to launch if we use all the available shared memory. + // TODO: why? total_shmem -= 1024; - int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; // break up the work into blocks, which are a starting and ending row/col #. @@ -1381,6 +1380,16 @@ std::vector> convert_to_rows(cudf::table_view cons } c = c.child(1); } + exclusive_scan([t](int row_index) { + size_type total_row_size = 0; + for (int i=0 i> convert_to_rows(cudf::table_view cons // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. auto validity_size = num_bitmask_words(num_columns) * 4; + // thrust for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned @@ -1578,7 +1588,7 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in int total_shmem; CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - // TODO: unable to launch a kernel with all shared used + // TODO why? 
total_shmem -= 1024; int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; @@ -1628,11 +1638,7 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); -#if defined(DEBUG) - dim3 threads(std::min(std::min(128, shmem_limit_per_block / 8), (int)child.size())); -#else - dim3 threads(std::min(256, (int)child.size())); -#endif + dim3 threads(std::min(std::min(256, shmem_limit_per_block / 8), (int)child.size())); detail::copy_to_columns<<>>( num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), @@ -1641,8 +1647,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); auto const column_stride = [&]() { if (desired_rows_and_columns > num_columns) { - // not many columns, group it into 8s and ship it off - return std::min(8, num_columns); + // not many columns, group it into 64s and ship it off + return std::min(64, num_columns); } else { return util::round_down_safe(desired_rows_and_columns, 8); } From 02cb81b95d53ad2e8330fcf768f55fc1502d707d Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Fri, 1 Oct 2021 15:14:54 +0000 Subject: [PATCH 18/80] Fixing merge issue --- cpp/benchmarks/CMakeLists.txt | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 5cc48436d01..7d353c37df7 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -29,6 +29,7 @@ target_link_libraries(cudf_datagen GTest::gmock_main GTest::gtest_main benchmark::benchmark + nvbench::nvbench Threads::Threads cudf) @@ -50,11 +51,19 @@ target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen) function(ConfigureBench CMAKE_BENCH_NAME) add_executable(${CMAKE_BENCH_NAME} ${ARGN}) set_target_properties(${CMAKE_BENCH_NAME} - PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$") + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$") target_link_libraries(${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_datagen benchmark::benchmark_main) endfunction() +function(ConfigureNVBench CMAKE_BENCH_NAME) + add_executable(${CMAKE_BENCH_NAME} ${ARGN}) + set_target_properties(${CMAKE_BENCH_NAME} + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$") + target_link_libraries(${CMAKE_BENCH_NAME} + PRIVATE cudf_benchmark_common cudf_datagen nvbench::main) +endfunction() + ################################################################################################### # - column benchmarks ----------------------------------------------------------------------------- ConfigureBench(COLUMN_CONCAT_BENCH column/concatenate_benchmark.cpp) @@ -67,6 +76,10 @@ ConfigureBench(GATHER_BENCH copying/gather_benchmark.cu) # - scatter benchmark ----------------------------------------------------------------------------- ConfigureBench(SCATTER_BENCH copying/scatter_benchmark.cu) +################################################################################################### +# - lists scatter benchmark ----------------------------------------------------------------------- +ConfigureBench(SCATTER_LISTS_BENCH lists/copying/scatter_lists_benchmark.cu) + ################################################################################################### # - 
contiguous_split benchmark ------------------------------------------------------------------- ConfigureBench(CONTIGUOUS_SPLIT_BENCH copying/contiguous_split_benchmark.cu) @@ -89,7 +102,8 @@ ConfigureBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates_benchma ################################################################################################### # - join benchmark -------------------------------------------------------------------------------- -ConfigureBench(JOIN_BENCH join/join_benchmark.cu) +ConfigureBench(JOIN_BENCH join/join_benchmark.cu join/conditional_join_benchmark.cu) +ConfigureNVBench(JOIN_NVBENCH join/join_nvbench.cu) ################################################################################################### # - iterator benchmark ---------------------------------------------------------------------------- @@ -191,6 +205,7 @@ ConfigureBench(AST_BENCH ast/transform_benchmark.cpp) # - binaryop benchmark ---------------------------------------------------------------------------- ConfigureBench(BINARYOP_BENCH binaryop/binaryop_benchmark.cpp + binaryop/compiled_binaryop_benchmark.cpp binaryop/jit_binaryop_benchmark.cpp) ################################################################################################### @@ -218,6 +233,7 @@ ConfigureBench(STRINGS_BENCH string/factory_benchmark.cu string/filter_benchmark.cpp string/find_benchmark.cpp + string/repeat_strings_benchmark.cpp string/replace_benchmark.cpp string/replace_re_benchmark.cpp string/split_benchmark.cpp @@ -231,5 +247,10 @@ ConfigureBench(JSON_BENCH string/json_benchmark.cpp) ################################################################################################### -# - row conversion benchmark ---------------------------------------------------------------------------- +# - io benchmark --------------------------------------------------------------------- +ConfigureBench(MULTIBYTE_SPLIT_BENCHMARK + io/text/multibyte_split_benchmark.cpp) + +################################################################################################### +# - row conversion benchmark --------------------------------------------------------- ConfigureBench(ROW_CONVERSION_BENCH row_conversion/row_conversion.cpp) From bae16f6976bda67f46e24e399b014ea1f7aff38d Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Fri, 1 Oct 2021 15:17:11 +0000 Subject: [PATCH 19/80] working on code to move block creation and batch creation to gpu --- cpp/src/row_conversion/row_conversion.cu | 180 +++++++++++++++++++- cpp/tests/row_conversion/row_conversion.cpp | 7 - 2 files changed, 178 insertions(+), 9 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index eb3c4b28b6a..ae218e637d0 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -20,6 +20,8 @@ #include #include #include +#include "cudf/detail/iterator.cuh" +#include "cudf/lists/lists_column_device_view.cuh" #include @@ -43,7 +45,9 @@ #include #include #include +#include +#include #include #include @@ -56,6 +60,7 @@ constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; #endif using cudf::detail::make_device_uvector_async; +using rmm::device_uvector; namespace cudf { namespace detail { @@ -1352,8 +1357,6 @@ __global__ void copy_validity_to_columns(const size_type num_rows, if (threadIdx.x % detail::warp_size == 0) { auto const validity_write_offset = validity_data_col_length * (relative_col + i) + relative_row / 8; - auto const write_5006_offset = 
837; // validity_data_col_length * (65 - - // block_start_col) + (5006 - block_start_row)/8; if (print_debug) printf( @@ -1674,6 +1677,173 @@ std::vector build_validity_block_infos( return validity_block_infos; } +constexpr size_t max_batch_size = 1024; // 2ul * 1024 * 1024 * 1024; + +template +void build_batches(size_t total_size, + size_type num_rows, + CumulativeRowSize cumulative_row_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const num_batches = ((total_size + (max_batch_size - 1)) / max_batch_size); + auto const num_offsets = num_batches + 1; + printf("%lu batches so %lu offsets\n", num_batches, num_offsets); + + // at most max gpu memory / 2GB iterations. + std::vector h_batch_row_offsets; + h_batch_row_offsets.reserve(num_offsets); + h_batch_row_offsets.push_back(0); + size_type last_row_end = 0; + while (h_batch_row_offsets.size() < num_batches) { + // subtract out the size of the last row in the previous batch + auto adjusted_row_size = + thrust::make_transform_iterator(cumulative_row_size + last_row_end, + [last_row_end, cumulative_row_size] __device__(size_t size) { + return size - cumulative_row_size[last_row_end]; + }); + // find the next max_batch_size boundary + size_type const row_end = ((thrust::lower_bound(rmm::exec_policy(stream), + adjusted_row_size, + adjusted_row_size + (num_rows - last_row_end), + max_batch_size) - + adjusted_row_size) + + last_row_end) - + 1; + + h_batch_row_offsets.push_back(row_end); + last_row_end = row_end; + } + printf("batches: "); + for (uint i = 0; i < h_batch_row_offsets.size(); ++i) { + printf("%d ", h_batch_row_offsets[i]); + } + printf("\n"); +} + +int compute_block_counts(device_uvector const& batch_row_offsets, + int desired_window_height, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + size_type const num_batches = batch_row_offsets.size() - 1; + device_uvector num_blocks(num_batches, stream); + auto iter = thrust::make_counting_iterator(0); + thrust::transform( + rmm::exec_policy(stream), + iter, + iter + num_batches, + num_blocks.begin(), + [desired_window_height, + batch_row_offsets = batch_row_offsets.data()] __device__(auto batch_index) -> size_type { + return (batch_row_offsets[batch_index + 1] - batch_row_offsets[batch_index]) / + desired_window_height; + }); + return thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); +} + +size_type block_lambda( + block_info* blocks, + device_uvector const& batch_row_offsets, // comes from build_batches + int column_start, + int column_end, + int desired_window_height, + int total_number_of_rows, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + size_type const num_batches = batch_row_offsets.size() - 1; + device_uvector num_blocks(num_batches, stream); + auto iter = thrust::make_counting_iterator(0); + thrust::transform( + rmm::exec_policy(stream), + iter, + iter + num_batches, + num_blocks.begin(), + [=, batch_row_offsets = batch_row_offsets.data()] __device__(int batch_index) -> size_type { + return (batch_row_offsets[batch_index + 1] - batch_row_offsets[batch_index]) / + desired_window_height; + }); + size_type const total_blocks = + thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); + device_uvector block_starts(num_batches, stream); + thrust::exclusive_scan(rmm::exec_policy(stream), + num_blocks.begin(), + num_blocks.end(), + block_starts.begin()); // in blocks + + thrust::for_each( + rmm::exec_policy(stream), + iter, + iter + 
total_blocks, + [ =, + block_starts = block_starts.data(), + batch_row_offsets = batch_row_offsets.data()] __device__(size_type block_index) { + block_info& bi = blocks[block_index]; + + // what batch this block falls in + auto const batch_index_iter = + thrust::lower_bound(thrust::seq, block_starts, block_starts + num_batches, block_index); + auto const batch_index = batch_index_iter == block_starts ? 0 : *batch_index_iter; + // local index within the block + int const local_block_index = block_index - block_starts[batch_index]; + // the start row for this batch. + int const batch_row_start = batch_row_offsets[batch_index]; + // the start row for this block + int const block_row_start = batch_row_start + (local_block_index * desired_window_height); + // the end row for this block + int const max_row = std::min(total_number_of_rows, + batch_index + 1 > num_batches + ? std::numeric_limits::max() + : static_cast(batch_row_offsets[batch_index + 1])); + int const block_row_end = + std::min(batch_row_start + ((local_block_index + 1) * desired_window_height) - 1, + total_number_of_rows); + + // stuff the block + bi.start_col = column_start; + bi.end_col = column_end; + bi.start_row = block_row_start; + bi.end_row = block_row_end; + bi.buffer_num = batch_index; + }); + + return total_blocks; +} + +void test_block_lambda(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) +{ + device_uvector batch_row_offsets(3, stream); + batch_row_offsets.set_element(0, 0, stream); + batch_row_offsets.set_element(1, 2000, stream); + batch_row_offsets.set_element(2, 5000, stream); + + // three groups of columns that can hold 128, 1024, and 768 rows each. + auto const total_blocks = compute_block_counts(batch_row_offsets, 128, stream, mr) + + compute_block_counts(batch_row_offsets, 1024, stream, mr) + + compute_block_counts(batch_row_offsets, 768, stream, mr); + + auto const table_num_rows = 50 * 1024; + + // allocate memory for all blocks + device_uvector blocks(total_blocks, stream); + + auto used_blocks = + block_lambda(blocks.data(), batch_row_offsets, 0, 15, 128, table_num_rows, stream, mr); + used_blocks += block_lambda( + blocks.data() + used_blocks, batch_row_offsets, 16, 28, 1024, table_num_rows, stream, mr); + used_blocks += block_lambda( + blocks.data() + used_blocks, batch_row_offsets, 29, 32, 768, table_num_rows, stream, mr); + + CUDF_EXPECTS(used_blocks == total_blocks, "used not equal to total!"); + + for (int i = 0; i < total_blocks; ++i) { + auto const block = blocks.element(i, stream); + printf( + "%d: %d,%d -> %d,%d\n", i, block.start_col, block.start_row, block.end_col, block.end_row); + } +} + std::vector build_block_infos(std::vector const& column_sizes, std::vector const& column_starts, std::vector const& row_batches, @@ -2245,6 +2415,12 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const& in cudf::size_type num_columns = schema.size(); cudf::size_type num_rows = input.parent().size(); + auto cumulative_row_size = cudf::detail::make_counting_transform_iterator( + 0, [] __device__(size_t row_index) { return 300 * row_index; }); + detail::build_batches(1024 * 1024, 1024, cumulative_row_size, stream, mr); + + detail::test_block_lambda(stream, mr); + int device_id; CUDA_TRY(cudaGetDevice(&device_id)); int total_shmem; diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index 70a4552a6f9..48d9690d583 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -185,13 
+185,6 @@ TEST_F(ColumnToRowTests, Non2Power) auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); for (int j = 0; j < old_tbl->num_columns(); ++j) { - printf("testing column %d\n", j); - if (j == 65) { - printf("old\n"); - cudf::test::print(old_tbl->get_column(j)); - printf("new\n"); - cudf::test::print(new_tbl->get_column(j)); - } CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); } From 36568485ec7caaec1ab5188b1cb5a1fbaea45b51 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 6 Oct 2021 19:41:49 +0000 Subject: [PATCH 20/80] pulling incomplete code for gpu building block data --- cpp/src/row_conversion/row_conversion.cu | 173 --------------------- java/src/main/native/src/row_conversion.cu | 53 +------ 2 files changed, 6 insertions(+), 220 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index ae218e637d0..9674000a69d 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -1677,173 +1677,6 @@ std::vector build_validity_block_infos( return validity_block_infos; } -constexpr size_t max_batch_size = 1024; // 2ul * 1024 * 1024 * 1024; - -template -void build_batches(size_t total_size, - size_type num_rows, - CumulativeRowSize cumulative_row_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const num_batches = ((total_size + (max_batch_size - 1)) / max_batch_size); - auto const num_offsets = num_batches + 1; - printf("%lu batches so %lu offsets\n", num_batches, num_offsets); - - // at most max gpu memory / 2GB iterations. - std::vector h_batch_row_offsets; - h_batch_row_offsets.reserve(num_offsets); - h_batch_row_offsets.push_back(0); - size_type last_row_end = 0; - while (h_batch_row_offsets.size() < num_batches) { - // subtract out the size of the last row in the previous batch - auto adjusted_row_size = - thrust::make_transform_iterator(cumulative_row_size + last_row_end, - [last_row_end, cumulative_row_size] __device__(size_t size) { - return size - cumulative_row_size[last_row_end]; - }); - // find the next max_batch_size boundary - size_type const row_end = ((thrust::lower_bound(rmm::exec_policy(stream), - adjusted_row_size, - adjusted_row_size + (num_rows - last_row_end), - max_batch_size) - - adjusted_row_size) + - last_row_end) - - 1; - - h_batch_row_offsets.push_back(row_end); - last_row_end = row_end; - } - printf("batches: "); - for (uint i = 0; i < h_batch_row_offsets.size(); ++i) { - printf("%d ", h_batch_row_offsets[i]); - } - printf("\n"); -} - -int compute_block_counts(device_uvector const& batch_row_offsets, - int desired_window_height, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - size_type const num_batches = batch_row_offsets.size() - 1; - device_uvector num_blocks(num_batches, stream); - auto iter = thrust::make_counting_iterator(0); - thrust::transform( - rmm::exec_policy(stream), - iter, - iter + num_batches, - num_blocks.begin(), - [desired_window_height, - batch_row_offsets = batch_row_offsets.data()] __device__(auto batch_index) -> size_type { - return (batch_row_offsets[batch_index + 1] - batch_row_offsets[batch_index]) / - desired_window_height; - }); - return thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); -} - -size_type block_lambda( - block_info* blocks, - device_uvector const& batch_row_offsets, // comes from build_batches - int column_start, - int column_end, - int 
desired_window_height, - int total_number_of_rows, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - size_type const num_batches = batch_row_offsets.size() - 1; - device_uvector num_blocks(num_batches, stream); - auto iter = thrust::make_counting_iterator(0); - thrust::transform( - rmm::exec_policy(stream), - iter, - iter + num_batches, - num_blocks.begin(), - [=, batch_row_offsets = batch_row_offsets.data()] __device__(int batch_index) -> size_type { - return (batch_row_offsets[batch_index + 1] - batch_row_offsets[batch_index]) / - desired_window_height; - }); - size_type const total_blocks = - thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); - device_uvector block_starts(num_batches, stream); - thrust::exclusive_scan(rmm::exec_policy(stream), - num_blocks.begin(), - num_blocks.end(), - block_starts.begin()); // in blocks - - thrust::for_each( - rmm::exec_policy(stream), - iter, - iter + total_blocks, - [ =, - block_starts = block_starts.data(), - batch_row_offsets = batch_row_offsets.data()] __device__(size_type block_index) { - block_info& bi = blocks[block_index]; - - // what batch this block falls in - auto const batch_index_iter = - thrust::lower_bound(thrust::seq, block_starts, block_starts + num_batches, block_index); - auto const batch_index = batch_index_iter == block_starts ? 0 : *batch_index_iter; - // local index within the block - int const local_block_index = block_index - block_starts[batch_index]; - // the start row for this batch. - int const batch_row_start = batch_row_offsets[batch_index]; - // the start row for this block - int const block_row_start = batch_row_start + (local_block_index * desired_window_height); - // the end row for this block - int const max_row = std::min(total_number_of_rows, - batch_index + 1 > num_batches - ? std::numeric_limits::max() - : static_cast(batch_row_offsets[batch_index + 1])); - int const block_row_end = - std::min(batch_row_start + ((local_block_index + 1) * desired_window_height) - 1, - total_number_of_rows); - - // stuff the block - bi.start_col = column_start; - bi.end_col = column_end; - bi.start_row = block_row_start; - bi.end_row = block_row_end; - bi.buffer_num = batch_index; - }); - - return total_blocks; -} - -void test_block_lambda(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) -{ - device_uvector batch_row_offsets(3, stream); - batch_row_offsets.set_element(0, 0, stream); - batch_row_offsets.set_element(1, 2000, stream); - batch_row_offsets.set_element(2, 5000, stream); - - // three groups of columns that can hold 128, 1024, and 768 rows each. 
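// A rough host-side sketch of the batch/block bookkeeping that the compute_block_counts /
// block_lambda helpers perform with thrust on the device: count how many blocks each batch
// contributes, exclusive-scan those counts to find where each batch's blocks begin in the
// flat block array, then fill in per-block row ranges. This is only an illustration under
// assumptions (plain std:: containers, ceiling division, hypothetical names sketch_block /
// sketch_build_blocks) and is not a cudf API.
#include <algorithm>
#include <numeric>
#include <vector>

struct sketch_block {
  int start_row;
  int end_row;
  int batch;
};

inline std::vector<sketch_block> sketch_build_blocks(
  std::vector<int> const& batch_row_offsets,  // num_batches + 1 row offsets
  int rows_per_block)
{
  int const num_batches = static_cast<int>(batch_row_offsets.size()) - 1;
  if (num_batches <= 0) { return {}; }

  // blocks contributed by each batch (ceiling division so a partial block still counts)
  std::vector<int> blocks_per_batch(num_batches);
  for (int b = 0; b < num_batches; ++b) {
    int const rows      = batch_row_offsets[b + 1] - batch_row_offsets[b];
    blocks_per_batch[b] = (rows + rows_per_block - 1) / rows_per_block;
  }

  // where each batch's blocks start in the flat output array
  std::vector<int> block_starts(num_batches);
  std::exclusive_scan(blocks_per_batch.begin(), blocks_per_batch.end(), block_starts.begin(), 0);

  int const total_blocks = block_starts.back() + blocks_per_batch.back();
  std::vector<sketch_block> blocks(total_blocks);
  for (int b = 0; b < num_batches; ++b) {
    for (int i = 0; i < blocks_per_batch[b]; ++i) {
      int const start_row = batch_row_offsets[b] + i * rows_per_block;
      int const end_row =
        std::min(start_row + rows_per_block - 1, batch_row_offsets[b + 1] - 1);
      blocks[block_starts[b] + i] = {start_row, end_row, b};
    }
  }
  return blocks;
}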
- auto const total_blocks = compute_block_counts(batch_row_offsets, 128, stream, mr) + - compute_block_counts(batch_row_offsets, 1024, stream, mr) + - compute_block_counts(batch_row_offsets, 768, stream, mr); - - auto const table_num_rows = 50 * 1024; - - // allocate memory for all blocks - device_uvector blocks(total_blocks, stream); - - auto used_blocks = - block_lambda(blocks.data(), batch_row_offsets, 0, 15, 128, table_num_rows, stream, mr); - used_blocks += block_lambda( - blocks.data() + used_blocks, batch_row_offsets, 16, 28, 1024, table_num_rows, stream, mr); - used_blocks += block_lambda( - blocks.data() + used_blocks, batch_row_offsets, 29, 32, 768, table_num_rows, stream, mr); - - CUDF_EXPECTS(used_blocks == total_blocks, "used not equal to total!"); - - for (int i = 0; i < total_blocks; ++i) { - auto const block = blocks.element(i, stream); - printf( - "%d: %d,%d -> %d,%d\n", i, block.start_col, block.start_row, block.end_col, block.end_row); - } -} - std::vector build_block_infos(std::vector const& column_sizes, std::vector const& column_starts, std::vector const& row_batches, @@ -2415,12 +2248,6 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const& in cudf::size_type num_columns = schema.size(); cudf::size_type num_rows = input.parent().size(); - auto cumulative_row_size = cudf::detail::make_counting_transform_iterator( - 0, [] __device__(size_t row_index) { return 300 * row_index; }); - detail::build_batches(1024 * 1024, 1024, cumulative_row_size, stream, mr); - - detail::test_block_lambda(stream, mr); - int device_id; CUDA_TRY(cudaGetDevice(&device_id)); int total_shmem; diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index c64a61b3373..481787c6004 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -21,6 +21,8 @@ #include #include +#include +#include #include #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 @@ -42,6 +44,8 @@ #include #include #include +#include +#include #include #include @@ -54,6 +58,7 @@ constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; #endif using cudf::detail::make_device_uvector_async; +using rmm::device_uvector; namespace cudf { namespace detail { @@ -885,8 +890,6 @@ __global__ void copy_validity_to_columns( if (threadIdx.x % detail::warp_size == 0) { auto const validity_write_offset = validity_data_col_length * (relative_col + i) + relative_row / 8; - auto const write_5006_offset = 837; // validity_data_col_length * (65 - block_start_col) - // + (5006 - block_start_row)/8; if (rows_left <= 8) { // write byte @@ -1330,28 +1333,7 @@ std::vector> convert_to_rows(cudf::table_view cons }); size_type fixed_width_size_per_row = - detail::compute_column_information(iter, iter + num_columns, column_starts, - column_sizes); //, - // [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); - /* size_type fixed_width_size_per_row = 0; - for (int col = 0; col < num_columns; ++col) { - auto cv = tbl.column(col); - auto col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (nested_type) { variable_width_columns.push_back(cv); } - - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 
8 : size_of(col_type); - - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - }*/ + detail::compute_column_information(iter, iter + num_columns, column_starts, column_sizes); auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); @@ -1368,29 +1350,6 @@ std::vector> convert_to_rows(cudf::table_view cons // will be included in the variable-width data blob at the end of the // row. return 0; - /* auto c = variable_width_columns[col]; - while (true) { - auto col_offsets = c.child(0).data(); - auto col_data_size = size_of(c.child(1).type()); - std::size_t alignment_needed = col_data_size; - - row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; - if (c.num_children() == 0) { - break; - } - c = c.child(1); - } - exclusive_scan([t](int row_index) { - size_type total_row_size = 0; - for (int i=0 i Date: Wed, 6 Oct 2021 14:43:18 -0700 Subject: [PATCH 21/80] Use the new row<->col method Added a new method `convertFromRowsFixedWidthOptimized` and `convertToRowsFixedWidthOptimized` to be used for when columns are < 100. Otherwise use the new method This is currently failing simple tests --- java/src/main/java/ai/rapids/cudf/Table.java | 33 +++++++++++ java/src/main/native/src/TableJni.cpp | 56 ++++++++++++++++++- .../test/java/ai/rapids/cudf/TableTest.java | 43 +++++++++++++- 3 files changed, 128 insertions(+), 4 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 0af02d1c926..65c8fcc2c0d 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -627,8 +627,12 @@ private static native long[] conditionalLeftAntiJoinGatherMapWithCount(long left private static native long[] convertToRows(long nativeHandle); + private static native long[] convertToRowsFixedWidthOptimized(long nativeHandle); + private static native long[] convertFromRows(long nativeColumnView, int[] types, int[] scale); + private static native long[] convertFromRowsFixedWidthOptimized(long nativeColumnView, int[] types, int[] scale); + private static native long[] repeatStaticCount(long tableHandle, int count); private static native long[] repeatColumnCount(long tableHandle, @@ -2684,6 +2688,15 @@ public ColumnVector[] convertToRows() { return ret; } + public ColumnVector[] convertToRowsFixedWidthOptimized() { + long[] ptrs = convertToRowsFixedWidthOptimized(nativeHandle); + ColumnVector[] ret = new ColumnVector[ptrs.length]; + for (int i = 0; i < ptrs.length; i++) { + ret[i] = new ColumnVector(ptrs[i]); + } + return ret; + } + /** * Convert a column of list of bytes that is formatted like the output from `convertToRows` * and convert it back to a table. @@ -2704,6 +2717,26 @@ public static Table convertFromRows(ColumnView vec, DType ... schema) { return new Table(convertFromRows(vec.getNativeView(), types, scale)); } + /** + * Convert a column of list of bytes that is formatted like the output from `convertToRows` + * and convert it back to a table. + * @param vec the row data to process. + * @param schema the types of each column. + * @return the parsed table. 
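 * <p>
 * Note: this method is intended to pair with {@code convertToRowsFixedWidthOptimized}, which
 * handles only relatively narrow tables of fixed-width columns; rows produced by
 * {@code convertToRows} should instead be converted back with {@code convertFromRows}.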
+ */ + public static Table convertFromRowsFixedWidthOptimized(ColumnView vec, DType ... schema) { + // TODO at some point we need a schema that support nesting so we can support nested types + // TODO we will need scale at some point very soon too + int[] types = new int[schema.length]; + int[] scale = new int[schema.length]; + for (int i = 0; i < schema.length; i++) { + types[i] = schema[i].typeId.nativeId; + scale[i] = schema[i].getScale(); + + } + return new Table(convertFromRowsFixedWidthOptimized(vec.getNativeView(), types, scale)); + } + /** * Construct a table from a packed representation. * @param metadata host-based metadata for the table diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index ee75112a2ed..cdd0623eb77 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -597,16 +598,20 @@ class native_arrow_ipc_reader_handle final { static jlongArray convert_table_for_return(JNIEnv *env, std::unique_ptr &table_result, std::vector> &extra_columns) { + std::cout << "entering convert_table_for_return\n"; std::vector> ret = table_result->release(); int table_cols = ret.size(); int num_columns = table_cols + extra_columns.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); + std::cout << "0\n"; for (int i = 0; i < table_cols; i++) { outcol_handles[i] = reinterpret_cast(ret[i].release()); } + std::cout << "1\n"; for (size_t i = 0; i < extra_columns.size(); i++) { outcol_handles[i + table_cols] = reinterpret_cast(extra_columns[i].release()); } + std::cout << "exiting convert_table_for_return\n"; return outcol_handles.get_jArray(); } @@ -2692,14 +2697,35 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_gather(JNIEnv *env, jclas CATCH_STD(env, 0); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRowsFixedWidthOptimized(JNIEnv *env, jclass, + jlong input_table) { + JNI_NULL_CHECK(env, input_table, "input table is null", 0); + + try { + cudf::jni::auto_set_device(env); + cudf::table_view *n_input_table = reinterpret_cast(input_table); + std::vector> cols = cudf::old_convert_to_rows(*n_input_table); + int num_columns = cols.size(); + cudf::jni::native_jlongArray outcol_handles(env, num_columns); + for (int i = 0; i < num_columns; i++) { + outcol_handles[i] = reinterpret_cast(cols[i].release()); + } + return outcol_handles.get_jArray(); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env, jclass, jlong input_table) { JNI_NULL_CHECK(env, input_table, "input table is null", 0); try { + std::cout << "convert_to_rows\n"; cudf::jni::auto_set_device(env); cudf::table_view *n_input_table = reinterpret_cast(input_table); - std::vector> cols = cudf::java::convert_to_rows(*n_input_table); + std::cout << "before convert_to_rows\n"; + std::vector> cols = cudf::convert_to_rows(*n_input_table); + std::cout << "after convert_to_rows\n"; int num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); for (int i = 0; i < num_columns; i++) { @@ -2710,6 +2736,29 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env CATCH_STD(env, 0); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRowsFixedWidthOptimized(JNIEnv *env, jclass, + jlong input_column, + jintArray types, + jintArray scale) { + JNI_NULL_CHECK(env, input_column, "input column is null", 0); + 
JNI_NULL_CHECK(env, types, "types is null", 0); + + try { + cudf::jni::auto_set_device(env); + cudf::column_view *input = reinterpret_cast(input_column); + cudf::lists_column_view list_input(*input); + cudf::jni::native_jintArray n_types(env, types); + cudf::jni::native_jintArray n_scale(env, scale); + std::vector types_vec; + for (int i = 0; i < n_types.size(); i++) { + types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i])); + } + std::unique_ptr result = cudf::old_convert_from_rows(list_input, types_vec); + return cudf::jni::convert_table_for_return(env, result); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *env, jclass, jlong input_column, jintArray types, @@ -2718,6 +2767,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *e JNI_NULL_CHECK(env, types, "types is null", 0); try { + std::cout << "convert_from_rows\n"; cudf::jni::auto_set_device(env); cudf::column_view *input = reinterpret_cast(input_column); cudf::lists_column_view list_input(*input); @@ -2727,7 +2777,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *e for (int i = 0; i < n_types.size(); i++) { types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i])); } - std::unique_ptr result = cudf::java::convert_from_rows(list_input, types_vec); + std::cout << "before convert_from_rows\n"; + std::unique_ptr result = cudf::convert_from_rows(list_input, types_vec); + std::cout << "after convert_from_rows\n"; return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index aa9ef5bf766..4ddeb542bbf 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -51,6 +51,7 @@ import java.nio.file.Files; import java.util.*; import java.util.stream.Collectors; +import java.util.stream.IntStream; import static ai.rapids.cudf.ParquetColumnWriterOptions.mapColumn; import static ai.rapids.cudf.ParquetWriterOptions.listBuilder; @@ -7093,6 +7094,44 @@ void testStructColumnFilterStrings() { } } + @Test + void fixedWidthRowsRoundTripWide() { + TestBuilder tb = new TestBuilder(); + IntStream.range(0, 10).forEach(i -> tb.column(3l, 9l, 4l, 2l, 20l, null)); + IntStream.range(0, 10).forEach(i -> tb.column(5.0d, 9.5d, 0.9d, 7.23d, 2.8d, null)); + IntStream.range(0, 10).forEach(i -> tb.column(5, 1, 0, 2, 7, null)); + IntStream.range(0, 10).forEach(i -> tb.column(true, false, false, true, false, null)); + IntStream.range(0, 10).forEach(i -> tb.column(1.0f, 3.5f, 5.9f, 7.1f, 9.8f, null)); + IntStream.range(0, 10).forEach(i -> tb.column(new Byte[]{2, 3, 4, 5, 9, null})); + IntStream.range(0, 10).forEach(i -> tb.decimal32Column(-3, RoundingMode.UNNECESSARY, 5.0d, + 9.5d, 0.9d, 7.23d, 2.8d, null)); + IntStream.range(0, 10).forEach(i -> tb.decimal64Column(-8, 3L, 9L, 4L, 2L, 20L, null)); + try (Table t = tb.build()) { + ColumnVector[] rows = t.convertToRows(); + try { + // We didn't overflow + assert rows.length == 1; + ColumnVector cv = rows[0]; + assert cv.getRowCount() == t.getRowCount(); +// try (HostColumnVector hcv = cv.copyToHost()) { +// hcv.getChildColumnView(0).getDataBuffer().printBuffer(8); +// } + + DType[] types = new DType[t.getNumberOfColumns()]; + for (int i = 0; i < t.getNumberOfColumns(); i++) { + types[i] = t.getColumn(i).getType(); + } + try (Table backAgain = Table.convertFromRows(cv, types)) 
{ + assertTablesAreEqual(t, backAgain); + } + } finally { + for (ColumnVector cv : rows) { + cv.close(); + } + } + } + } + @Test void fixedWidthRowsRoundTrip() { try (Table t = new TestBuilder() @@ -7105,7 +7144,7 @@ void fixedWidthRowsRoundTrip() { .decimal32Column(-3, RoundingMode.UNNECESSARY, 5.0d, 9.5d, 0.9d, 7.23d, 2.8d, null) .decimal64Column(-8, 3L, 9L, 4L, 2L, 20L, null) .build()) { - ColumnVector[] rows = t.convertToRows(); + ColumnVector[] rows = t.convertToRowsFixedWidthOptimized(); try { // We didn't overflow assert rows.length == 1; @@ -7119,7 +7158,7 @@ void fixedWidthRowsRoundTrip() { for (int i = 0; i < t.getNumberOfColumns(); i++) { types[i] = t.getColumn(i).getType(); } - try (Table backAgain = Table.convertFromRows(cv, types)) { + try (Table backAgain = Table.convertFromRowsFixedWidthOptimized(cv, types)) { assertTablesAreEqual(t, backAgain); } } finally { From 966c34ce57c17ee924d91cb2f83f3468c2c43833 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 7 Oct 2021 04:01:36 +0000 Subject: [PATCH 22/80] Fixing issue Raza found with 8-byte data --- cpp/src/row_conversion/row_conversion.cu | 27 +++-- cpp/tests/row_conversion/row_conversion.cpp | 122 ++++++++++++++++---- java/src/main/native/src/row_conversion.cu | 23 ++-- 3 files changed, 132 insertions(+), 40 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 9674000a69d..84fab20fce5 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -333,9 +333,9 @@ struct block_info { int end_row; int buffer_num; - __host__ __device__ size_type get_row_size(size_type const* const col_offsets, - size_type const* const col_sizes, - bool debug_print = false) const + __host__ __device__ size_type get_shared_row_size(size_type const* const col_offsets, + size_type const* const col_sizes, + bool debug_print = false) const { if (debug_print) printf("col_offsets[%d]: %p + col_sizes[%d]: %p - col_offsets[%d]: %p\n%d + %d - %d\n", @@ -350,6 +350,14 @@ struct block_info { col_offsets[start_col]); return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); } + __host__ __device__ size_type get_dest_row_size(size_type const* const col_offsets, + size_type const* const col_sizes, + bool debug_print = false) const + { + return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col] + + util::div_rounding_up_unsafe(num_cols(), 8), + 8); + } __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } @@ -456,7 +464,7 @@ __global__ void copy_from_columns(const size_type num_rows, auto const num_fetch_cols = fetch_block.num_cols(); auto const num_fetch_rows = fetch_block.num_rows(); auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; - auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); + auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); auto const starting_column_offset = col_offsets[fetch_block.start_col]; auto& fetch_barrier = block_barrier[fetch % NUM_BLOCKS_PER_KERNEL_LOADED]; @@ -513,7 +521,8 @@ __global__ void copy_from_columns(const size_type num_rows, /* auto const rows_in_block = block.num_rows(); auto const cols_in_block = block.num_cols();*/ - auto const block_row_size = block.get_row_size(col_offsets, col_sizes); + auto const block_row_size = block.get_shared_row_size(col_offsets, 
col_sizes); + auto const dest_row_size = block.get_dest_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; // copy entire rows to final dest @@ -521,7 +530,7 @@ __global__ void copy_from_columns(const size_type num_rows, absolute_row += blockDim.x) { auto const relative_row = absolute_row - block.start_row; auto const output_dest = - output_data[block.buffer_num] + absolute_row * block_row_size + column_offset; + output_data[block.buffer_num] + absolute_row * dest_row_size + column_offset; if (debug_print) printf("processing row %d\noutput data[%d] is address %p\n", absolute_row, @@ -918,8 +927,8 @@ static __device__ void fetch_blocks_for_row_to_column( auto const fetch_block_end_row = fetch_block.end_row; auto const starting_col_offset = col_offsets[fetch_block.start_col]; - auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); - auto const num_fetch_cols = fetch_block.num_cols(); + auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); + auto const num_fetch_cols = fetch_block.num_cols(); auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( sizeof(decltype(*col_sizes)), sizeof(decltype(*col_offsets)), num_fetch_cols); auto& fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; @@ -1115,7 +1124,7 @@ __global__ void copy_to_columns(const size_type num_rows, auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); - auto block_row_size = block.get_row_size(_col_offsets, _col_sizes, debug_print); + auto block_row_size = block.get_shared_row_size(_col_offsets, _col_sizes, debug_print); // now we copy from shared memory to final destination. // the data is laid out in rows in shared memory, so the reads diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index 48d9690d583..0ab8b70a0f7 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -46,9 +46,9 @@ TEST_F(ColumnToRowTests, Single) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - } + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, Simple) @@ -68,9 +68,9 @@ TEST_F(ColumnToRowTests, Simple) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - } + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, Tall) @@ -93,9 +93,9 @@ TEST_F(ColumnToRowTests, Tall) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - } + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, Wide) @@ -122,9 +122,9 @@ TEST_F(ColumnToRowTests, Wide) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - } + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, SingleByteWide) @@ -153,9 +153,9 @@ 
TEST_F(ColumnToRowTests, SingleByteWide) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - } + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, Non2Power) @@ -191,9 +191,9 @@ TEST_F(ColumnToRowTests, Non2Power) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - } + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, Big) @@ -218,9 +218,21 @@ TEST_F(ColumnToRowTests, Big) auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + for (int j = 0; j < old_tbl->num_columns(); ++j) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); + } + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } + + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, Bigger) @@ -245,9 +257,20 @@ TEST_F(ColumnToRowTests, Bigger) auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + for (int j = 0; j < old_tbl->num_columns(); ++j) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); + } + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } + + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, Biggest) @@ -272,9 +295,20 @@ TEST_F(ColumnToRowTests, Biggest) auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + for (int j = 0; j < old_tbl->num_columns(); ++j) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); + } + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(RowToColumnTests, Single) @@ -379,6 +413,46 @@ TEST_F(RowToColumnTests, SingleByteWide) } } +TEST_F(RowToColumnTests, Raza) +{ + std::vector> cols; + std::vector views; + std::vector schema{cudf::data_type{cudf::type_id::INT64}, + cudf::data_type{cudf::type_id::FLOAT64}, + 
cudf::data_type{cudf::type_id::INT8}, + cudf::data_type{cudf::type_id::BOOL8}, + cudf::data_type{cudf::type_id::FLOAT32}, + cudf::data_type{cudf::type_id::INT8}, + cudf::data_type{cudf::type_id::INT32}, + cudf::data_type{cudf::type_id::INT64}}; + + cudf::test::fixed_width_column_wrapper c0({3, 9, 4, 2, 20, 0}, {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_width_column_wrapper c1({5.0, 9.5, 0.9, 7.23, 2.8, 0.0}, + {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_width_column_wrapper c2({5, 1, 0, 2, 7, 0}, {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_width_column_wrapper c3({true, false, false, true, false, false}, + {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_width_column_wrapper c4({1.0f, 3.5f, 5.9f, 7.1f, 9.8f, 0.0f}, + {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_width_column_wrapper c5({2, 3, 4, 5, 9, 0}, {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_point_column_wrapper c6( + {-300, 500, 950, 90, 723, 0}, {1, 1, 1, 1, 1, 1, 1, 0}, numeric::scale_type{-2}); + cudf::test::fixed_point_column_wrapper c7( + {-80, 30, 90, 20, 200, 0}, {1, 1, 1, 1, 1, 1, 0}, numeric::scale_type{-1}); + + cudf::table_view in({c0, c1, c2, c3, c4, c5, c6, c7}); + + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + TEST_F(RowToColumnTests, Non2Power) { auto r = diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 481787c6004..1808c7534df 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -330,10 +330,18 @@ struct block_info { int end_row; int buffer_num; - __host__ __device__ size_type get_row_size(size_type const *const col_offsets, - size_type const *const col_sizes) const { + __host__ __device__ size_type get_shared_row_size(size_type const *const col_offsets, + size_type const *const col_sizes, + bool debug_print = false) const { return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); } + __host__ __device__ size_type get_dest_row_size(size_type const *const col_offsets, + size_type const *const col_sizes, + bool debug_print = false) const { + return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col] + + util::div_rounding_up_unsafe(num_cols(), 8), + 8); + } __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } @@ -409,7 +417,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto const num_fetch_cols = fetch_block.num_cols(); auto const num_fetch_rows = fetch_block.num_rows(); auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; - auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); + auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); auto const starting_column_offset = col_offsets[fetch_block.start_col]; auto &fetch_barrier = block_barrier[fetch % NUM_BLOCKS_PER_KERNEL_LOADED]; @@ -448,7 +456,8 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; - auto const block_row_size = 
block.get_row_size(col_offsets, col_sizes); + auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); + auto const dest_row_size = block.get_dest_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; // copy entire rows to final dest @@ -457,7 +466,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto const relative_row = absolute_row - block.start_row; auto const output_dest = - output_data[block.buffer_num] + absolute_row * block_row_size + column_offset; + output_data[block.buffer_num] + absolute_row * dest_row_size + column_offset; auto const shared_offset = block_row_size * relative_row; cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], block_row_size, @@ -650,7 +659,7 @@ fetch_blocks_for_row_to_column(size_t &fetch_index, size_t const processing_inde auto const fetch_block_start_row = fetch_block.start_row; auto const fetch_block_end_row = fetch_block.end_row; auto const starting_col_offset = col_offsets[fetch_block.start_col]; - auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); + auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); auto const num_fetch_cols = fetch_block.num_cols(); auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( sizeof(decltype(*col_sizes)), sizeof(decltype(*col_offsets)), num_fetch_cols); @@ -766,7 +775,7 @@ __global__ void copy_to_columns(const size_type num_rows, const size_type num_co auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); - auto block_row_size = block.get_row_size(_col_offsets, _col_sizes); + auto block_row_size = block.get_shared_row_size(_col_offsets, _col_sizes); // now we copy from shared memory to final destination. // the data is laid out in rows in shared memory, so the reads From 6452e8eb6137e2a1f31049ec0dc0add1d6947f9f Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 13 Oct 2021 22:06:04 +0000 Subject: [PATCH 23/80] fixing bug with float columns when 'enough' data was present. 
Updated function names --- .../row_conversion/row_conversion.cpp | 8 +- cpp/include/cudf/row_conversion.hpp | 4 +- cpp/src/row_conversion/row_conversion.cu | 95 ++++--- cpp/tests/row_conversion/row_conversion.cpp | 245 ++++++++++++------ java/src/main/native/src/TableJni.cpp | 16 +- java/src/main/native/src/row_conversion.cu | 66 ++--- java/src/main/native/src/row_conversion.hpp | 19 +- 7 files changed, 265 insertions(+), 188 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index 2fe436a22c1..fb8e4c8aef3 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -50,7 +50,7 @@ static void BM_old_to_row(benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); - auto rows = cudf::old_convert_to_rows(table->view()); + auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view()); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); @@ -109,13 +109,13 @@ static void BM_old_from_row(benchmark::State& state) total_bytes += cudf::size_of(t); } - auto rows = cudf::old_convert_to_rows(table->view()); + auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view()); cudf::lists_column_view const first_list(rows.front()->view()); for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); - auto out = cudf::old_convert_from_rows(first_list, schema); + auto out = cudf::convert_from_rows_fixed_width_optimized(first_list, schema); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); @@ -144,7 +144,7 @@ static void BM_new_from_row(benchmark::State& state) total_bytes += cudf::size_of(t); } - auto rows = cudf::old_convert_to_rows(table->view()); + auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view()); cudf::lists_column_view const first_list(rows.front()->view()); for (auto _ : state) { diff --git a/cpp/include/cudf/row_conversion.hpp b/cpp/include/cudf/row_conversion.hpp index 8f82d01b06c..5d799f4c596 100644 --- a/cpp/include/cudf/row_conversion.hpp +++ b/cpp/include/cudf/row_conversion.hpp @@ -24,7 +24,7 @@ namespace cudf { -std::vector> old_convert_to_rows( +std::vector> convert_to_rows_fixed_width_optimized( cudf::table_view const& tbl, // TODO need something for validity rmm::cuda_stream_view stream = rmm::cuda_stream_default, @@ -36,7 +36,7 @@ std::vector> convert_to_rows( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -std::unique_ptr old_convert_from_rows( +std::unique_ptr convert_from_rows_fixed_width_optimized( cudf::lists_column_view const& input, std::vector const& schema, rmm::cuda_stream_view stream = rmm::cuda_stream_default, diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 84fab20fce5..0457bbf71e4 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -53,7 +53,7 @@ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 constexpr auto NUM_BLOCKS_PER_KERNEL_TO_COLUMNS = 8; -constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 2; constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; @@ -350,14 +350,6 @@ struct block_info { 
col_offsets[start_col]); return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); } - __host__ __device__ size_type get_dest_row_size(size_type const* const col_offsets, - size_type const* const col_sizes, - bool debug_print = false) const - { - return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col] + - util::div_rounding_up_unsafe(num_cols(), 8), - 8); - } __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } @@ -441,9 +433,8 @@ __global__ void copy_from_columns(const size_type num_rows, // else { return; } auto const blocks_remaining = - std::min((uint)(num_block_infos % NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS), - std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS, - (uint)NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS, + (uint)NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS); size_t fetch; size_t subset; @@ -451,11 +442,11 @@ __global__ void copy_from_columns(const size_type num_rows, // Fetch ahead up to stages_count subsets for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { if (debug_print) - printf("fetching block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch); - auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch]; + printf("fetching block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch); + auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch]; if (debug_print) printf("block %lu rows %d-%d and cols %d-%d\n", - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch, + blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch, fetch_block.start_row, fetch_block.end_row, fetch_block.start_col, @@ -474,9 +465,9 @@ __global__ void copy_from_columns(const size_type num_rows, // to do the copy we need to do n column copies followed by m element copies OR // we have to do m element copies followed by r row copies. When going from column // to row it is much easier to copy by elements first otherwise we would need a running - // total of the column sizes for our block, which isn't readily available. This makes it more - // appealing to copy element-wise from input data into shared matching the end layout and do - // row-based memcopies out. + // total of the column sizes for our block, which isn't readily available. This makes it + // more appealing to copy element-wise from input data into shared matching the end layout + // and do row-based memcopies out. for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { auto const relative_col = el / num_fetch_rows; @@ -499,14 +490,15 @@ __global__ void copy_from_columns(const size_type num_rows, auto const input_src = input_data[absolute_col] + col_size * absolute_row; if (debug_print) - printf("block %lu to shared chunk %lu. %p <- %p - %d bytes\n", + printf("block %lu to shared chunk %lu. 
%p <- %p(0x%x) - %d bytes\n", fetch, fetch % stages_count, &shared[fetch % stages_count][shared_offset], input_src, + *input_src, col_size); - // copy the element to global memory + // copy the element from global memory cuda::memcpy_async( &shared[fetch % stages_count][shared_offset], input_src, col_size, fetch_barrier); } @@ -515,14 +507,11 @@ __global__ void copy_from_columns(const size_type num_rows, auto& subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; subset_barrier.arrive_and_wait(); - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset]; if (debug_print) - printf("reading block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset); + printf("reading block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset); - /* auto const rows_in_block = block.num_rows(); - auto const cols_in_block = block.num_cols();*/ auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); - auto const dest_row_size = block.get_dest_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; // copy entire rows to final dest @@ -530,7 +519,7 @@ __global__ void copy_from_columns(const size_type num_rows, absolute_row += blockDim.x) { auto const relative_row = absolute_row - block.start_row; auto const output_dest = - output_data[block.buffer_num] + absolute_row * dest_row_size + column_offset; + output_data[block.buffer_num] + row_offsets[absolute_row] + column_offset; if (debug_print) printf("processing row %d\noutput data[%d] is address %p\n", absolute_row, @@ -543,6 +532,7 @@ __global__ void copy_from_columns(const size_type num_rows, &shared[subset % stages_count][shared_offset], block_row_size, absolute_row); + cuda::memcpy_async( output_dest, &shared[subset % stages_count][shared_offset], block_row_size, subset_barrier); } @@ -673,7 +663,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, if (print_debug) printf( - "%d %d - my warp is %d, %d total sections(%d x, %d y), %d warps per block, blockDim.x=%d, " + "%d %d - my warp is %d, %d total sections(%d x, %d y), %d warps per block, " + "blockDim.x=%d, " "warp size " "%d\n", threadIdx.x, @@ -709,7 +700,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, if (print_debug) printf( - "participation mask is 0x%x for relative row %d(%d real), relative col %d(%d absolute)\n", + "participation mask is 0x%x for relative row %d(%d real), relative col %d(%d " + "absolute)\n", participation_mask, relative_row, absolute_row, @@ -744,8 +736,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, absolute_col); // every thread that is participating in the warp has a byte, but it's column-based - // data and we need it in row-based. So we shuffle the bits around with ballot_sync to make - // the bytes we actually write. + // data and we need it in row-based. So we shuffle the bits around with ballot_sync to + // make the bytes we actually write. for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); // lead thread in each warp writes data @@ -915,7 +907,8 @@ static __device__ void fetch_blocks_for_row_to_column( block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index]; if (debug_print) printf( - "fetching block %lu of %d for start col %d, end col %d. 
Starting col offset is %p, ending " + "fetching block %lu of %d for start col %d, end col %d. Starting col offset is %p, " + "ending " "offset %p\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index, total_blocks, @@ -1242,7 +1235,8 @@ __global__ void copy_validity_to_columns(const size_type num_rows, block_infos, blockIdx.x); printf( - "%d %d - Shared memory starts at %p and ends at %p, input data is %p, output data is %p, row " + "%d %d - Shared memory starts at %p and ends at %p, input data is %p, output data is %p, " + "row " "offsets are %p, block infos at %p\n", threadIdx.x, blockIdx.x, @@ -1595,8 +1589,8 @@ static inline int32_t compute_fixed_width_layout(std::vector co } // Now we need to add in space for validity - // Eventually we can think about nullable vs not nullable, but for now we will just always add it - // in + // Eventually we can think about nullable vs not nullable, but for now we will just always add + // it in int32_t validity_bytes_needed = (schema.size() + 7) / 8; // cudf::util::div_rounding_up_safe(schema.size(), 8); // validity comes at the end and is byte aligned so we can pack more in. @@ -1727,11 +1721,11 @@ std::vector build_block_infos(std::vector const& column_s }; // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write - // would be memory cache line sized access, but since other blocks will read/write the edges this - // may not turn out to be overly important. For now, we will attempt to build a square window as - // far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we - // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in - // bytes, not rows or columns. + // would be memory cache line sized access, but since other blocks will read/write the edges + // this may not turn out to be overly important. For now, we will attempt to build a square + // window as far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = + // shared_mem_size since we want them equal, so height and width are sqrt(shared_mem_size). The + // trick is that it's in bytes, not rows or columns. size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); int const window_height = std::clamp( util::round_up_safe( @@ -1787,9 +1781,11 @@ std::vector build_block_infos(std::vector const& column_s calc_admin_data_size(col - current_window_start_col), shmem_limit_per_block); printf( - "Window size %d too large at column %d, admin size is %d, bumping back to build windows of " + "Window size %d too large at column %d, admin size is %d, bumping back to build windows " + "of " "size %d(cols " - "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " + "%d-%d), which is %d tall. 
Row size is too large at %d and ok at %d(aligned overall is " + "%d) " "for shared mem size %d\n", row_size_with_end_pad * window_height, col, @@ -1809,7 +1805,8 @@ std::vector build_block_infos(std::vector const& column_s detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); #if defined(DEBUG) printf( - "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " + "New window starting with offset %d and row size %d to be %d (previous column offset " + "%d+%d " "or %d)\n", row_size, col_size, @@ -2172,9 +2169,8 @@ std::vector> convert_to_rows(cudf::table_view cons #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } -std::vector> old_convert_to_rows(cudf::table_view const& tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::vector> convert_to_rows_fixed_width_optimized( + cudf::table_view const& tbl, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { const cudf::size_type num_columns = tbl.num_columns(); @@ -2399,10 +2395,11 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const& in #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } -std::unique_ptr old_convert_from_rows(cudf::lists_column_view const& input, - std::vector const& schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr convert_from_rows_fixed_width_optimized( + cudf::lists_column_view const& input, + std::vector const& schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // verify that the types are what we expect cudf::column_view child = input.child(); diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index 0ab8b70a0f7..746ac0655f7 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -14,15 +14,21 @@ * limitations under the License. 
*/ +#include +#include +#include +#include +#include +#include #include #include #include #include #include -#include -#include "cudf/lists/lists_column_view.hpp" -#include "cudf/types.hpp" +#include + +#include struct ColumnToRowTests : public cudf::test::BaseFixture { }; @@ -35,20 +41,17 @@ TEST_F(ColumnToRowTests, Single) cudf::table_view in(std::vector{a}); std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, Simple) @@ -57,20 +60,17 @@ TEST_F(ColumnToRowTests, Simple) cudf::table_view in(std::vector{a}); std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, Tall) @@ -81,21 +81,18 @@ TEST_F(ColumnToRowTests, Tall) cudf::table_view in(std::vector{a}); std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, Wide) @@ -111,20 +108,17 @@ TEST_F(ColumnToRowTests, Wide) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = 
cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, SingleByteWide) @@ -141,21 +135,18 @@ TEST_F(ColumnToRowTests, SingleByteWide) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, Non2Power) @@ -175,13 +166,14 @@ TEST_F(ColumnToRowTests, Non2Power) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); for (int j = 0; j < old_tbl->num_columns(); ++j) { @@ -190,10 +182,6 @@ TEST_F(ColumnToRowTests, Non2Power) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, Big) @@ -214,13 +202,14 @@ TEST_F(ColumnToRowTests, Big) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); for (int j = 0; j < old_tbl->num_columns(); ++j) { @@ -229,10 +218,6 @@ TEST_F(ColumnToRowTests, Big) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, Bigger) @@ -253,12 +238,13 @@ TEST_F(ColumnToRowTests, Bigger) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + 
cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); for (int j = 0; j < old_tbl->num_columns(); ++j) { @@ -267,10 +253,6 @@ TEST_F(ColumnToRowTests, Bigger) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, Biggest) @@ -291,13 +273,14 @@ TEST_F(ColumnToRowTests, Biggest) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); for (int j = 0; j < old_tbl->num_columns(); ++j) { @@ -306,9 +289,6 @@ TEST_F(ColumnToRowTests, Biggest) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(RowToColumnTests, Single) @@ -319,7 +299,8 @@ TEST_F(RowToColumnTests, Single) auto old_rows = cudf::convert_to_rows(in); std::vector schema{cudf::data_type{cudf::type_id::INT32}}; for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -331,10 +312,11 @@ TEST_F(RowToColumnTests, Simple) cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); cudf::table_view in(std::vector{a}); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); std::vector schema{cudf::data_type{cudf::type_id::INT32}}; for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -348,14 +330,15 @@ TEST_F(RowToColumnTests, Tall) cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); cudf::table_view in(std::vector{a}); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); std::vector schema; schema.reserve(in.num_columns()); for (auto col = in.begin(); col < in.end(); ++col) { schema.push_back(col->type()); } for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -373,7 +356,7 @@ 
TEST_F(RowToColumnTests, Wide) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); std::vector schema; schema.reserve(in.num_columns()); for (auto col = in.begin(); col < in.end(); ++col) { @@ -381,7 +364,8 @@ TEST_F(RowToColumnTests, Wide) } for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -399,21 +383,22 @@ TEST_F(RowToColumnTests, SingleByteWide) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); std::vector schema; schema.reserve(in.num_columns()); for (auto col = in.begin(); col < in.end(); ++col) { schema.push_back(col->type()); } for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } } -TEST_F(RowToColumnTests, Raza) +TEST_F(RowToColumnTests, AllTypes) { std::vector> cols; std::vector views; @@ -442,11 +427,115 @@ TEST_F(RowToColumnTests, Raza) cudf::table_view in({c0, c1, c2, c3, c4, c5, c6, c7}); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + auto new_rows = cudf::convert_to_rows(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, AllTypesLarge) +{ + std::vector cols; + std::vector schema{}; + + // 10 columns of each type with 1024 entries + constexpr int num_rows{1024}; + + std::default_random_engine re; + std::uniform_real_distribution rand_double(std::numeric_limits::min(), + std::numeric_limits::max()); + std::uniform_int_distribution rand_int64(std::numeric_limits::min(), + std::numeric_limits::max()); + auto r = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) -> int64_t { return rand_int64(re); }); + auto d = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) -> double { return rand_double(re); }); + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + schema.push_back(cudf::data_type{cudf::type_id::INT8}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + schema.push_back(cudf::data_type{cudf::type_id::INT16}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper(d, d + num_rows).release().release()); + 
schema.push_back(cudf::data_type{cudf::type_id::FLOAT32}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper(d, d + num_rows).release().release()); + schema.push_back(cudf::data_type{cudf::type_id::FLOAT64}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + schema.push_back(cudf::data_type{cudf::type_id::BOOL8}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper( + r, r + num_rows) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper( + r, r + num_rows) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_point_column_wrapper(r, r + num_rows, numeric::scale_type{-2}) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::DECIMAL32}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_point_column_wrapper(r, r + num_rows, numeric::scale_type{-1}) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::DECIMAL64}); + } + + std::vector views(cols.begin(), cols.end()); + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -470,10 +559,11 @@ TEST_F(RowToColumnTests, Non2Power) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -498,10 +588,11 @@ TEST_F(RowToColumnTests, Big) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -526,10 +617,11 @@ TEST_F(RowToColumnTests, Bigger) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + 
cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -554,10 +646,11 @@ TEST_F(RowToColumnTests, Biggest) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index cdd0623eb77..109ee571b7d 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -2697,14 +2697,15 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_gather(JNIEnv *env, jclas CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRowsFixedWidthOptimized(JNIEnv *env, jclass, - jlong input_table) { +JNIEXPORT jlongArray JNICALL +Java_ai_rapids_cudf_Table_convertToRowsFixedWidthOptimized(JNIEnv *env, jclass, jlong input_table) { JNI_NULL_CHECK(env, input_table, "input table is null", 0); try { cudf::jni::auto_set_device(env); cudf::table_view *n_input_table = reinterpret_cast(input_table); - std::vector> cols = cudf::old_convert_to_rows(*n_input_table); + std::vector> cols = + cudf::convert_to_rows_fixed_width_optimized(*n_input_table); int num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); for (int i = 0; i < num_columns; i++) { @@ -2736,10 +2737,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRowsFixedWidthOptimized(JNIEnv *env, jclass, - jlong input_column, - jintArray types, - jintArray scale) { +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRowsFixedWidthOptimized( + JNIEnv *env, jclass, jlong input_column, jintArray types, jintArray scale) { JNI_NULL_CHECK(env, input_column, "input column is null", 0); JNI_NULL_CHECK(env, types, "types is null", 0); @@ -2753,7 +2752,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRowsFixedWidth for (int i = 0; i < n_types.size(); i++) { types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i])); } - std::unique_ptr result = cudf::old_convert_from_rows(list_input, types_vec); + std::unique_ptr result = + cudf::convert_from_rows_fixed_width_optimized(list_input, types_vec); return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 1808c7534df..e6cd9a9da32 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -21,8 +21,6 @@ #include #include -#include -#include #include #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 @@ -30,10 +28,12 @@ #endif #include +#include #include #include #include #include +#include #include #include #include @@ -51,7 +51,7 @@ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 constexpr auto NUM_BLOCKS_PER_KERNEL_TO_COLUMNS = 8; -constexpr 
auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 2; constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; @@ -331,17 +331,9 @@ struct block_info { int buffer_num; __host__ __device__ size_type get_shared_row_size(size_type const *const col_offsets, - size_type const *const col_sizes, - bool debug_print = false) const { + size_type const *const col_sizes) const { return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); } - __host__ __device__ size_type get_dest_row_size(size_type const *const col_offsets, - size_type const *const col_sizes, - bool debug_print = false) const { - return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col] + - util::div_rounding_up_unsafe(num_cols(), 8), - 8); - } __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } @@ -404,16 +396,15 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ group.sync(); auto const blocks_remaining = - std::min((uint)(num_block_infos % NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS), - std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS, - (uint)NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS, + (uint)NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS); size_t fetch; size_t subset; for (subset = fetch = 0; subset < blocks_remaining; ++subset) { // Fetch ahead up to stages_count subsets for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { - auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch]; + auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch]; auto const num_fetch_cols = fetch_block.num_cols(); auto const num_fetch_rows = fetch_block.num_rows(); auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; @@ -429,9 +420,9 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ // to do the copy we need to do n column copies followed by m element copies OR // we have to do m element copies followed by r row copies. When going from column // to row it is much easier to copy by elements first otherwise we would need a running - // total of the column sizes for our block, which isn't readily available. This makes it more - // appealing to copy element-wise from input data into shared matching the end layout and do - // row-based memcopies out. + // total of the column sizes for our block, which isn't readily available. This makes it + // more appealing to copy element-wise from input data into shared matching the end layout + // and do row-based memcopies out. 
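
The element-then-row strategy described in the comment above is easier to see in miniature. What follows is a minimal host-side sketch, not the kernel: the names (col_data, col_offsets, row_size) and the plain vector standing in for shared memory are illustrative assumptions, and row padding/alignment are ignored for brevity.

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

int main()
{
  // Two fixed-width columns are staged element-by-element into a row-major
  // scratch buffer (standing in for shared memory), then copied out one whole
  // row at a time. All names and sizes here are illustrative assumptions.
  constexpr int num_rows = 4;
  std::vector<int32_t> col_a{0, 1, 2, 3};      // a 4-byte column
  std::vector<int64_t> col_b{10, 11, 12, 13};  // an 8-byte column
  const void* col_data[]  = {col_a.data(), col_b.data()};
  int const col_sizes[]   = {4, 8};
  int const col_offsets[] = {0, 4};  // byte offset of each column within a row
  int const row_size      = 12;      // 4 + 8 bytes; real rows are padded/aligned

  std::vector<int8_t> scratch(row_size * num_rows);  // "shared memory"
  for (int col = 0; col < 2; ++col) {                // element-wise copies in
    for (int row = 0; row < num_rows; ++row) {
      std::memcpy(scratch.data() + row * row_size + col_offsets[col],
                  static_cast<int8_t const*>(col_data[col]) + row * col_sizes[col],
                  col_sizes[col]);
    }
  }

  std::vector<int8_t> out(row_size * num_rows);  // final row-major destination
  for (int row = 0; row < num_rows; ++row) {     // row-wise copies out
    std::memcpy(out.data() + row * row_size, scratch.data() + row * row_size, row_size);
  }

  int64_t check = 0;  // read back row 2 of the 8-byte column
  std::memcpy(&check, out.data() + 2 * row_size + col_offsets[1], sizeof check);
  std::printf("row 2, col_b = %lld\n", static_cast<long long>(check));
  return 0;
}

Staging in the final row layout first means the copies out to the destination buffer are long, contiguous, row-sized writes, which is the payoff the comment is describing.
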
for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { auto const relative_col = el / num_fetch_rows; @@ -445,7 +436,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto const shared_offset = relative_row * fetch_block_row_size + relative_col_offset; auto const input_src = input_data[absolute_col] + col_size * absolute_row; - // copy the element to global memory + // copy the element from global memory cuda::memcpy_async(&shared[fetch % stages_count][shared_offset], input_src, col_size, fetch_barrier); } @@ -454,10 +445,8 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; subset_barrier.arrive_and_wait(); - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; - + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset]; auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); - auto const dest_row_size = block.get_dest_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; // copy entire rows to final dest @@ -466,7 +455,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto const relative_row = absolute_row - block.start_row; auto const output_dest = - output_data[block.buffer_num] + absolute_row * dest_row_size + column_offset; + output_data[block.buffer_num] + row_offsets[absolute_row] + column_offset; auto const shared_offset = block_row_size * relative_row; cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], block_row_size, @@ -563,8 +552,8 @@ __global__ void copy_validity_from_columns( input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] : 0xFF; // every thread that is participating in the warp has a byte, but it's column-based - // data and we need it in row-based. So we shuffle the bits around with ballot_sync to make - // the bytes we actually write. + // data and we need it in row-based. So we shuffle the bits around with ballot_sync to + // make the bytes we actually write. for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); // lead thread in each warp writes data @@ -1085,8 +1074,8 @@ static inline int32_t compute_fixed_width_layout(std::vector co } // Now we need to add in space for validity - // Eventually we can think about nullable vs not nullable, but for now we will just always add it - // in + // Eventually we can think about nullable vs not nullable, but for now we will just always add + // it in int32_t validity_bytes_needed = (schema.size() + 7) / 8; // cudf::util::div_rounding_up_safe(schema.size(), 8); // validity comes at the end and is byte aligned so we can pack more in. @@ -1209,11 +1198,11 @@ std::vector build_block_infos(std::vector const &column_s }; // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write - // would be memory cache line sized access, but since other blocks will read/write the edges this - // may not turn out to be overly important. For now, we will attempt to build a square window as - // far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we - // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in - // bytes, not rows or columns. 
+ // would be memory cache line sized access, but since other blocks will read/write the edges + // this may not turn out to be overly important. For now, we will attempt to build a square + // window as far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = + // shared_mem_size since we want them equal, so height and width are sqrt(shared_mem_size). The + // trick is that it's in bytes, not rows or columns. size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); int const window_height = std::clamp( util::round_up_safe( @@ -1478,8 +1467,8 @@ std::vector> convert_to_rows(cudf::table_view cons } std::vector> -old_convert_to_rows(cudf::table_view const &tbl, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { +convert_to_rows_fixed_width_optimized(cudf::table_view const &tbl, rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { const cudf::size_type num_columns = tbl.num_columns(); std::vector schema; @@ -1656,10 +1645,9 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } -std::unique_ptr old_convert_from_rows(cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { +std::unique_ptr convert_from_rows_fixed_width_optimized( + cudf::lists_column_view const &input, std::vector const &schema, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { // verify that the types are what we expect cudf::column_view child = input.child(); cudf::type_id list_type = child.type().id(); diff --git a/java/src/main/native/src/row_conversion.hpp b/java/src/main/native/src/row_conversion.hpp index 517202f3892..edc2768d4bb 100644 --- a/java/src/main/native/src/row_conversion.hpp +++ b/java/src/main/native/src/row_conversion.hpp @@ -25,11 +25,11 @@ namespace cudf { namespace java { -std::vector> -old_convert_to_rows(cudf::table_view const &tbl, - // TODO need something for validity - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); +std::vector> convert_to_rows_fixed_width_optimized( + cudf::table_view const &tbl, + // TODO need something for validity + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); std::vector> convert_to_rows(cudf::table_view const &tbl, @@ -37,11 +37,10 @@ convert_to_rows(cudf::table_view const &tbl, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); -std::unique_ptr -old_convert_from_rows(cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); +std::unique_ptr convert_from_rows_fixed_width_optimized( + cudf::lists_column_view const &input, std::vector const &schema, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, std::vector const &schema, From c0e989570bfdcbd51bb2abed0bed87a5c7f5cedd Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Fri, 15 Oct 2021 15:20:52 -0700 Subject: [PATCH 24/80] code cleanup and removed comments --- java/src/main/native/src/TableJni.cpp | 10 
---------- 1 file changed, 10 deletions(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 109ee571b7d..d0e6b895a1e 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -598,20 +598,16 @@ class native_arrow_ipc_reader_handle final { static jlongArray convert_table_for_return(JNIEnv *env, std::unique_ptr &table_result, std::vector> &extra_columns) { - std::cout << "entering convert_table_for_return\n"; std::vector> ret = table_result->release(); int table_cols = ret.size(); int num_columns = table_cols + extra_columns.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); - std::cout << "0\n"; for (int i = 0; i < table_cols; i++) { outcol_handles[i] = reinterpret_cast(ret[i].release()); } - std::cout << "1\n"; for (size_t i = 0; i < extra_columns.size(); i++) { outcol_handles[i + table_cols] = reinterpret_cast(extra_columns[i].release()); } - std::cout << "exiting convert_table_for_return\n"; return outcol_handles.get_jArray(); } @@ -2721,12 +2717,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env JNI_NULL_CHECK(env, input_table, "input table is null", 0); try { - std::cout << "convert_to_rows\n"; cudf::jni::auto_set_device(env); cudf::table_view *n_input_table = reinterpret_cast(input_table); - std::cout << "before convert_to_rows\n"; std::vector> cols = cudf::convert_to_rows(*n_input_table); - std::cout << "after convert_to_rows\n"; int num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); for (int i = 0; i < num_columns; i++) { @@ -2767,7 +2760,6 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *e JNI_NULL_CHECK(env, types, "types is null", 0); try { - std::cout << "convert_from_rows\n"; cudf::jni::auto_set_device(env); cudf::column_view *input = reinterpret_cast(input_column); cudf::lists_column_view list_input(*input); @@ -2777,9 +2769,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *e for (int i = 0; i < n_types.size(); i++) { types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i])); } - std::cout << "before convert_from_rows\n"; std::unique_ptr result = cudf::convert_from_rows(list_input, types_vec); - std::cout << "after convert_from_rows\n"; return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); From b9df725c892767f37a386b51b80c04a42da39bc7 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 21 Oct 2021 00:53:01 +0000 Subject: [PATCH 25/80] Fixing validity buffer alignment issue for row data --- cpp/src/row_conversion/row_conversion.cu | 142 ++++++++++++-------- cpp/tests/row_conversion/row_conversion.cpp | 63 ++++++--- java/src/main/native/src/row_conversion.cu | 58 +++++--- 3 files changed, 165 insertions(+), 98 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 0457bbf71e4..90bd8b88ef0 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -469,6 +469,7 @@ __global__ void copy_from_columns(const size_type num_rows, // more appealing to copy element-wise from input data into shared matching the end layout // and do row-based memcopies out. 
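
The hunk that follows stages reads through shared_buffer_base and switches on col_size so the async copy can be handed a compile-time size and alignment guarantee (cuda::aligned_size_t in the patch). Below is a rough host-side analogue of that dispatch; the function names are invented for this sketch and plain memcpy stands in for cuda::memcpy_async.

#include <cstdint>
#include <cstdio>
#include <cstring>

// A host-side stand-in for the size-dispatched copy: the switch tells the copy
// routine the exact (and therefore aligned) element size at compile time, which
// is the role cuda::aligned_size_t plays for cuda::memcpy_async in the kernel.
// The function names below are assumptions made for this sketch.
template <int Size>
void copy_fixed(void* dst, void const* src)
{
  // device code in the patch instead does roughly:
  //   cuda::memcpy_async(dst, src, cuda::aligned_size_t<Size>(Size), barrier);
  std::memcpy(dst, src, Size);
}

void copy_element(void* dst, void const* src, int col_size)
{
  switch (col_size) {
    case 2: copy_fixed<2>(dst, src); break;
    case 4: copy_fixed<4>(dst, src); break;
    case 8: copy_fixed<8>(dst, src); break;
    default: std::memcpy(dst, src, col_size); break;  // other sizes: no hint
  }
}

int main()
{
  alignas(8) int64_t src = 42;
  alignas(8) int64_t dst = 0;
  copy_element(&dst, &src, sizeof src);
  std::printf("copied %lld\n", static_cast<long long>(dst));
  return 0;
}
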
+ auto const shared_buffer_base = shared[fetch % stages_count]; for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { auto const relative_col = el / num_fetch_rows; auto const relative_row = el % num_fetch_rows; @@ -493,14 +494,36 @@ __global__ void copy_from_columns(const size_type num_rows, printf("block %lu to shared chunk %lu. %p <- %p(0x%x) - %d bytes\n", fetch, fetch % stages_count, - &shared[fetch % stages_count][shared_offset], + &shared_buffer_base[shared_offset], input_src, *input_src, col_size); // copy the element from global memory - cuda::memcpy_async( - &shared[fetch % stages_count][shared_offset], input_src, col_size, fetch_barrier); + switch (col_size) { + case 2: + cuda::memcpy_async(&shared_buffer_base[shared_offset], + input_src, + cuda::aligned_size_t<2>(col_size), + fetch_barrier); + break; + case 4: + cuda::memcpy_async(&shared_buffer_base[shared_offset], + input_src, + cuda::aligned_size_t<4>(col_size), + fetch_barrier); + break; + case 8: + cuda::memcpy_async(&shared_buffer_base[shared_offset], + input_src, + cuda::aligned_size_t<8>(col_size), + fetch_barrier); + break; + default: + cuda::memcpy_async( + &shared_buffer_base[shared_offset], input_src, col_size, fetch_barrier); + break; + } } } @@ -511,15 +534,15 @@ __global__ void copy_from_columns(const size_type num_rows, if (debug_print) printf("reading block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset); - auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); - auto const column_offset = col_offsets[block.start_col]; + auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); + auto const column_offset = col_offsets[block.start_col]; + auto const block_output_buffer = output_data[block.buffer_num]; // copy entire rows to final dest for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; absolute_row += blockDim.x) { auto const relative_row = absolute_row - block.start_row; - auto const output_dest = - output_data[block.buffer_num] + row_offsets[absolute_row] + column_offset; + auto const output_dest = block_output_buffer + row_offsets[absolute_row] + column_offset; if (debug_print) printf("processing row %d\noutput data[%d] is address %p\n", absolute_row, @@ -533,8 +556,10 @@ __global__ void copy_from_columns(const size_type num_rows, block_row_size, absolute_row); - cuda::memcpy_async( - output_dest, &shared[subset % stages_count][shared_offset], block_row_size, subset_barrier); + cuda::memcpy_async(output_dest, + &shared[subset % stages_count][shared_offset], + cuda::aligned_size_t<8>(block_row_size), + subset_barrier); } } @@ -641,8 +666,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, auto const num_block_cols = block.num_cols(); auto const num_block_rows = block.num_rows(); - auto const num_sections_x = (num_block_cols + 31) / 32; - auto const num_sections_y = (num_block_rows + 7) / 8; + auto const num_sections_x = util::div_rounding_up_unsafe(num_block_cols, 32); + auto const num_sections_y = util::div_rounding_up_unsafe(num_block_rows, 32); auto const validity_data_row_length = align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); auto const total_sections = num_sections_x * num_sections_y; @@ -690,7 +715,7 @@ __global__ void copy_validity_from_columns(const size_type num_rows, my_section_idx, total_sections); auto const relative_col = section_x * 32 + lane_id; - auto const relative_row = section_y * 8; + auto const relative_row = section_y * 32; 
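
The validity loop below turns column-oriented validity words into row-oriented bits with __ballot_sync, one 32-column by 32-row section per warp. Here is a host-side model of that transpose, assuming a full section with all 32 lanes participating; the array names and values are invented for illustration.

#include <cstdint>
#include <cstdio>

int main()
{
  // col_validity[lane] models the 32-bit word lane `lane` reads for its column:
  // bit i says whether row i of that column is valid. The inner loop reproduces
  // what __ballot_sync(mask, my_data & (1u << i)) hands back to the warp: bit
  // `lane` of the result is column `lane`'s validity for row i.
  uint32_t col_validity[32];
  for (uint32_t lane = 0; lane < 32; ++lane) { col_validity[lane] = 0x0F0F0F0Fu ^ lane; }

  uint32_t row_validity[32] = {};  // bit `lane` of row_validity[i] = column lane, row i
  for (int i = 0; i < 32; ++i) {
    uint32_t ballot = 0;
    for (int lane = 0; lane < 32; ++lane) {
      if (col_validity[lane] & (1u << i)) { ballot |= 1u << lane; }
    }
    row_validity[i] = ballot;  // the lead lane writes this into the shared validity buffer
  }

  std::printf("row 0 validity bits: 0x%08x\n", row_validity[0]);
  return 0;
}
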
auto const absolute_col = relative_col + block.start_col; auto const absolute_row = relative_row + block.start_row; auto const cols_left = num_columns - absolute_col; @@ -720,15 +745,15 @@ __global__ void copy_validity_from_columns(const size_type num_rows, absolute_row, relative_col, absolute_col); - auto my_byte = - input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] : 0xFF; + auto my_data = input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] + : std::numeric_limits::max(); if (print_debug) printf( - "thread %d's byte is 0x%x, participation mask is 0x%x for relative row %d(%d real), " + "thread %d's bytes are 0x%x, participation mask is 0x%x for relative row %d(%d real), " "relative col %d(%d absolute)\n", threadIdx.x, - my_byte & 0xFF, + my_data, participation_mask, relative_row, absolute_row, @@ -738,8 +763,9 @@ __global__ void copy_validity_from_columns(const size_type num_rows, // every thread that is participating in the warp has a byte, but it's column-based // data and we need it in row-based. So we shuffle the bits around with ballot_sync to // make the bytes we actually write. - for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { - auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); + bitmask_type dw_mask = 1; + for (int i = 0; i < 32 && relative_row + i < num_rows; ++i, dw_mask <<= 1) { + auto validity_data = __ballot_sync(participation_mask, my_data & dw_mask); // lead thread in each warp writes data auto const validity_write_offset = validity_data_row_length * (relative_row + i) + relative_col / 8; @@ -750,8 +776,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, "0x%x\n", threadIdx.x, blockIdx.x, - byte_mask, - my_byte & byte_mask, + dw_mask, + my_data & dw_mask, validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED, validity_write_offset, validity_data); @@ -804,6 +830,9 @@ __global__ void copy_validity_from_columns(const size_type num_rows, // make sure entire block has finished copy group.sync(); + auto const output_data_base = + output_data[block.buffer_num] + validity_offset + block.start_col / 8; + // now async memcpy the shared memory out to the final destination for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { auto const relative_row = row - block.start_row; @@ -835,9 +864,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, word_index(block.start_col), this_shared_block[validity_data_row_length * relative_row]); } - auto const output_ptr = - output_data[block.buffer_num] + row_offsets[row] + validity_offset + block.start_col / 8; - auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); + auto const output_ptr = output_data_base + row_offsets[row]; + auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); cuda::memcpy_async( output_ptr, @@ -970,11 +998,20 @@ static __device__ void fetch_blocks_for_row_to_column( row += blockDim.x) { auto shared_offset = (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; if (debug_print) - printf("fetching block %lu to shared chunk %lu. %p <- %p\n", - fetch_index, - fetch_index % max_resident_blocks, - &shared[fetch_index % max_resident_blocks][shared_offset], - &input_data[row_offsets[row] + starting_col_offset]); + printf( + "%d - fetching block %lu to shared chunk %lu. 
%p(shared[%d %% %d][%d]) <- %p(row %d, row " + "offset %d starting col offset %d)\n", + threadIdx.x, + fetch_index, + fetch_index % max_resident_blocks, + &shared[fetch_index % max_resident_blocks][shared_offset], + (int)fetch_index, + max_resident_blocks, + shared_offset, + &input_data[row_offsets[row] + starting_col_offset], + row, + row_offsets[row], + starting_col_offset); // copy the main cuda::memcpy_async(&shared[fetch_index % max_resident_blocks][shared_offset], &input_data[row_offsets[row] + starting_col_offset], @@ -1021,7 +1058,7 @@ __global__ void copy_to_columns(const size_type num_rows, // to speed up some of the random access memory we do, we copy col_sizes and col_offsets // to shared memory for each of the blocks that we work on - /*constexpr*/ bool debug_print = false; // threadIdx.x == 0 && blockIdx.x == 0; + constexpr bool debug_print = false; // threadIdx.x == 2 && blockIdx.x == 0; constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; auto group = cooperative_groups::this_thread_block(); extern __shared__ int8_t shared_data[]; @@ -1094,12 +1131,12 @@ __global__ void copy_to_columns(const size_type num_rows, auto& subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; // ensure our data is ready - if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) + if (debug_print) printf("%d-%d waiting at barrier %p\n", threadIdx.x, blockIdx.x, &subset_barrier); subset_barrier.arrive_and_wait(); auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; - if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) + if (debug_print) printf("%d-%d reading block %lu at address %p\n", threadIdx.x, blockIdx.x, @@ -1159,19 +1196,19 @@ __global__ void copy_to_columns(const size_type num_rows, if (debug_print) { printf( - "relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, " - "shared_mmeory_row_offset: %d, shared_memory_offset: %d," - " column_size: %d, shmem_src: %p, dst: %p\n",//, uint32 is %u\n", - relative_col, - relative_row, - absolute_col, - absolute_row, - shared_memory_row_offset, - shared_memory_offset, - column_size, - shmem_src, - dst/*, - *reinterpret_cast(shmem_src)*/); + "relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, " + "shared_mmeory_row_offset: %d, shared_memory_offset: %d," + " column_size: %d, shmem_src: %p, dst: %p\n",//, uint32 is %u\n", + relative_col, + relative_row, + absolute_col, + absolute_row, + shared_memory_row_offset, + shared_memory_offset, + column_size, + shmem_src, + dst/*, + *reinterpret_cast(shmem_src)*/); printf("memcpy_async(%p, %p, %d, subset_barrier);\n", dst, shmem_src, column_size); } if (debug_print && absolute_col == 0 && absolute_row == 51) { @@ -1185,7 +1222,7 @@ __global__ void copy_to_columns(const size_type num_rows, cuda::memcpy_async(dst, shmem_src, column_size, subset_barrier); } group.sync(); - if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) + if (debug_print) printf( "%d-%d copy to main memory with barrier %p\n", threadIdx.x, blockIdx.x, &subset_barrier); } @@ -1224,9 +1261,7 @@ __global__ void copy_validity_to_columns(const size_type num_rows, int8_t* shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { shared_data, shared_data + shmem_used_per_block / 2}; - bool print_debug = false; // threadIdx.x == 0 && blockIdx.x == 0; - // bool print_debug = false; - // if (blockIdx.x != 3 || threadIdx.x / 32 != 0) return; + constexpr bool print_debug = false; // threadIdx.x == 0 && blockIdx.x == 0; if (print_debug) 
{ printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); printf("%d %d - block infos are at %p and my index is %d\n", @@ -1246,10 +1281,6 @@ __global__ void copy_validity_to_columns(const size_type num_rows, output_nm, row_offsets, block_infos); - /* printf("Row Offsets:\n"); - for (int i=0; i double { return rand_double(re); }); + auto all_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 1; }); + auto none_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 0; }); + auto most_valid = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return rand() % 2 == 0 ? 0 : 1; }); + auto few_valid = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return rand() % 13 == 0 ? 1 : 0; }); + for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, all_valid) + .release() + .release()); schema.push_back(cudf::data_type{cudf::type_id::INT8}); } for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) + .release() + .release()); schema.push_back(cudf::data_type{cudf::type_id::INT16}); } for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + if (i < 5) { + cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) + .release() + .release()); + } else { + cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, none_valid) + .release() + .release()); + } schema.push_back(cudf::data_type{cudf::type_id::INT32}); } for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper(d, d + num_rows).release().release()); + cols.push_back(*cudf::test::fixed_width_column_wrapper(d, d + num_rows, most_valid) + .release() + .release()); schema.push_back(cudf::data_type{cudf::type_id::FLOAT32}); } for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper(d, d + num_rows).release().release()); + cols.push_back(*cudf::test::fixed_width_column_wrapper(d, d + num_rows, most_valid) + .release() + .release()); schema.push_back(cudf::data_type{cudf::type_id::FLOAT64}); } for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) + .release() + .release()); schema.push_back(cudf::data_type{cudf::type_id::BOOL8}); } for (int i = 0; i < 10; ++i) { cols.push_back( *cudf::test::fixed_width_column_wrapper( - r, r + num_rows) + r, r + num_rows, all_valid) .release() .release()); schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}); @@ -505,25 +524,25 @@ TEST_F(RowToColumnTests, AllTypesLarge) for (int i = 0; i < 10; ++i) { cols.push_back( *cudf::test::fixed_width_column_wrapper( - r, r + num_rows) + r, r + num_rows, most_valid) .release() .release()); schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}); } for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_point_column_wrapper(r, r + num_rows, numeric::scale_type{-2}) - .release() - .release()); + cols.push_back(*cudf::test::fixed_point_column_wrapper( + r, r + num_rows, all_valid, 
numeric::scale_type{-2}) + .release() + .release()); schema.push_back(cudf::data_type{cudf::type_id::DECIMAL32}); } for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_point_column_wrapper(r, r + num_rows, numeric::scale_type{-1}) - .release() - .release()); + cols.push_back(*cudf::test::fixed_point_column_wrapper( + r, r + num_rows, most_valid, numeric::scale_type{-1}) + .release() + .release()); schema.push_back(cudf::data_type{cudf::type_id::DECIMAL64}); } diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index e6cd9a9da32..a67589fbaec 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -424,6 +424,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ // more appealing to copy element-wise from input data into shared matching the end layout // and do row-based memcopies out. + auto const shared_buffer_base = shared[fetch % stages_count]; for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { auto const relative_col = el / num_fetch_rows; auto const relative_row = el % num_fetch_rows; @@ -437,8 +438,24 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto const input_src = input_data[absolute_col] + col_size * absolute_row; // copy the element from global memory - cuda::memcpy_async(&shared[fetch % stages_count][shared_offset], input_src, col_size, - fetch_barrier); + switch (col_size) { + case 2: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, + cuda::aligned_size_t<2>(col_size), fetch_barrier); + break; + case 4: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, + cuda::aligned_size_t<4>(col_size), fetch_barrier); + break; + case 8: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, + cuda::aligned_size_t<8>(col_size), fetch_barrier); + break; + default: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, col_size, + fetch_barrier); + break; + } } } @@ -448,18 +465,17 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset]; auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; + auto const block_output_buffer = output_data[block.buffer_num]; // copy entire rows to final dest for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; absolute_row += blockDim.x) { - auto const relative_row = absolute_row - block.start_row; - auto const output_dest = - output_data[block.buffer_num] + row_offsets[absolute_row] + column_offset; + auto const output_dest = block_output_buffer + row_offsets[absolute_row] + column_offset; auto const shared_offset = block_row_size * relative_row; - cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], block_row_size, - subset_barrier); + cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], + cuda::aligned_size_t<8>(block_row_size), subset_barrier); } } @@ -523,8 +539,8 @@ __global__ void copy_validity_from_columns( auto const num_block_cols = block.num_cols(); auto const num_block_rows = block.num_rows(); - auto const num_sections_x = (num_block_cols + 31) / 32; - auto const num_sections_y = (num_block_rows + 7) / 8; + auto const num_sections_x = util::div_rounding_up_unsafe(num_block_cols, 32); + auto const 
num_sections_y = util::div_rounding_up_unsafe(num_block_rows, 32); auto const validity_data_row_length = align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); auto const total_sections = num_sections_x * num_sections_y; @@ -536,26 +552,27 @@ __global__ void copy_validity_from_columns( // the block is divided into sections. A warp operates on a section at a time. for (int my_section_idx = warp_id; my_section_idx < total_sections; my_section_idx += warps_per_block) { - // convert to rows and cols auto const section_x = my_section_idx % num_sections_x; auto const section_y = my_section_idx / num_sections_x; auto const relative_col = section_x * 32 + lane_id; - auto const relative_row = section_y * 8; + auto const relative_row = section_y * 32; auto const absolute_col = relative_col + block.start_col; auto const absolute_row = relative_row + block.start_row; auto const cols_left = num_columns - absolute_col; auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); if (absolute_col < num_columns) { - auto my_byte = - input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] : 0xFF; + auto my_data = input_nm[absolute_col] != nullptr ? + input_nm[absolute_col][absolute_row / 32] : + std::numeric_limits::max(); // every thread that is participating in the warp has a byte, but it's column-based // data and we need it in row-based. So we shuffle the bits around with ballot_sync to // make the bytes we actually write. - for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { - auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); + bitmask_type dw_mask = 1; + for (int i = 0; i < 32 && relative_row + i < num_rows; ++i, dw_mask <<= 1) { + auto validity_data = __ballot_sync(participation_mask, my_data & dw_mask); // lead thread in each warp writes data auto const validity_write_offset = validity_data_row_length * (relative_row + i) + relative_col / 8; @@ -585,11 +602,13 @@ __global__ void copy_validity_from_columns( // make sure entire block has finished copy group.sync(); + auto const output_data_base = + output_data[block.buffer_num] + validity_offset + block.start_col / 8; + // now async memcpy the shared memory out to the final destination for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { auto const relative_row = row - block.start_row; - auto const output_ptr = - output_data[block.buffer_num] + row_offsets[row] + validity_offset + block.start_col / 8; + auto const output_ptr = output_data_base + row_offsets[row]; auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); cuda::memcpy_async( @@ -917,8 +936,6 @@ __global__ void copy_validity_to_columns( // now async memcpy the shared for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { auto const relative_col = col - block.start_col; - auto const words_to_copy = util::div_rounding_up_unsafe(num_block_rows, 32); - auto const starting_address = output_nm[col] + word_index(block_start_row); cuda::memcpy_async( output_nm[col] + word_index(block_start_row), @@ -1111,7 +1128,7 @@ static size_type compute_column_information(iterator begin, iterator end, fixed_width_size_per_row += col_size; } - auto validity_offset = detail::align_offset(fixed_width_size_per_row, 4); + auto validity_offset = fixed_width_size_per_row; column_starts.push_back(validity_offset); return fixed_width_size_per_row; @@ -1233,7 +1250,6 @@ std::vector 
build_block_infos(std::vector const &column_s if (row_size_with_end_pad * window_height + calc_admin_data_size(col - current_window_start_col) > shmem_limit_per_block) { - // too large, close this window, generate vertical blocks and restart build_blocks(current_window_start_col, col == 0 ? col : col - 1, window_height); row_size = From 8d00447eb0e9c6166b6f3a01b199dbd9c0a88c9a Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 21 Oct 2021 18:02:07 +0000 Subject: [PATCH 26/80] Cleaning up code for PR --- cpp/src/row_conversion/row_conversion.cu | 4132 ++++++++------------ java/src/main/native/src/row_conversion.cu | 237 +- 2 files changed, 1740 insertions(+), 2629 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 90bd8b88ef0..c068a2c0b76 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -14,2487 +14,1653 @@ * limitations under the License. */ -#include -#include -#include -#include -#include -#include -#include "cudf/detail/iterator.cuh" -#include "cudf/lists/lists_column_device_view.cuh" - -#include - -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include - -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 -constexpr auto NUM_BLOCKS_PER_KERNEL_TO_COLUMNS = 8; -constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 2; -constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; -constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; -constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; -#endif - -using cudf::detail::make_device_uvector_async; -using rmm::device_uvector; -namespace cudf { - -namespace detail { - -static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size_t alignment) -{ - return (offset + alignment - 1) & ~(alignment - 1); -} - -__global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, - const cudf::size_type num_columns, - const cudf::size_type row_size, - const cudf::size_type* input_offset_in_row, - const cudf::size_type* num_bytes, - int8_t** output_data, - cudf::bitmask_type** output_nm, - const int8_t* input_data) -{ - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // For simplicity we will refer to this as a row_group - - // In practice we have found writing more than 4 columns of data per thread - // results in performance loss. As such we are using a 2 dimensional - // kernel in terms of threads, but not in terms of blocks. Columns are - // controlled by the y dimension (there is no y dimension in blocks). Rows - // are controlled by the x dimension (there are multiple blocks in the x - // dimension). 
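// --- Illustrative sketch (editor's annotation, not part of this patch) ---
// The launch geometry described in the comment above: blockDim.x threads each own one
// row of a "row group", and the blockDim.y lanes stride across that row's columns. A
// small host-side helper making the row-group arithmetic concrete (hypothetical name):
inline int sketch_num_row_groups(int num_rows, int threads_x)
{
  int const rows_per_group = threads_x;  // one x-thread per row of a group
  return (num_rows + rows_per_group - 1) / rows_per_group;  // ceiling division
}
// e.g. with dim3 threads(64, 4): thread (x, y) of row group g handles row g * 64 + x
// and copies columns y, y + 4, y + 8, ... of that row.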
- - cudf::size_type rows_per_group = blockDim.x; - cudf::size_type row_group_start = blockIdx.x; - cudf::size_type row_group_stride = gridDim.x; - cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; - - extern __shared__ int8_t shared_data[]; - - // Because we are copying fixed width only data and we stride the rows - // this thread will always start copying from shared data in the same place - int8_t* row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t* row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; - - for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; - row_group_index += row_group_stride) { - // Step 1: Copy the data into shared memory - // We know row_size is always aligned with and a multiple of int64_t; - int64_t* long_shared = reinterpret_cast(shared_data); - const int64_t* long_input = reinterpret_cast(input_data); - - cudf::size_type shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); - cudf::size_type shared_output_stride = blockDim.x * blockDim.y; - cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); - if (row_index_end > num_rows) { row_index_end = num_rows; } - cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - cudf::size_type shared_length = row_size * num_rows_in_group; - - cudf::size_type shared_output_end = shared_length / sizeof(int64_t); - - cudf::size_type start_input_index = - (row_size * row_group_index * rows_per_group) / sizeof(int64_t); - - for (cudf::size_type shared_index = shared_output_index; shared_index < shared_output_end; - shared_index += shared_output_stride) { - long_shared[shared_index] = long_input[start_input_index + shared_index]; - } - // Wait for all of the data to be in shared memory - __syncthreads(); - - // Step 2 copy the data back out - - // Within the row group there should be 1 thread for each row. This is a - // requirement for launching the kernel - cudf::size_type row_index = (row_group_index * rows_per_group) + threadIdx.x; - // But we might not use all of the threads if the number of rows does not go - // evenly into the thread count. We don't want those threads to exit yet - // because we may need them to copy data in for the next row group. - uint32_t active_mask = __ballot_sync(0xffffffff, row_index < num_rows); - if (row_index < num_rows) { - cudf::size_type col_index_start = threadIdx.y; - cudf::size_type col_index_stride = blockDim.y; - for (cudf::size_type col_index = col_index_start; col_index < num_columns; - col_index += col_index_stride) { - cudf::size_type col_size = num_bytes[col_index]; - const int8_t* col_tmp = &(row_tmp[input_offset_in_row[col_index]]); - int8_t* col_output = output_data[col_index]; - switch (col_size) { - case 1: { - col_output[row_index] = *col_tmp; - break; - } - case 2: { - int16_t* short_col_output = reinterpret_cast(col_output); - short_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - case 4: { - int32_t* int_col_output = reinterpret_cast(col_output); - int_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - case 8: { - int64_t* long_col_output = reinterpret_cast(col_output); - long_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - default: { - cudf::size_type output_offset = col_size * row_index; - // TODO this should just not be supported for fixed width columns, but just in case... 
- for (cudf::size_type b = 0; b < col_size; b++) { - col_output[b + output_offset] = col_tmp[b]; - } - break; - } - } - - cudf::bitmask_type* nm = output_nm[col_index]; - int8_t* valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; - int predicate = *valid_byte & (1 << byte_bit_offset); - uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } - } // end column loop - } // end row copy - // wait for the row_group to be totally copied before starting on the next row group - __syncthreads(); - } -} - -__global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, - const cudf::size_type num_rows, - const cudf::size_type num_columns, - const cudf::size_type row_size, - const cudf::size_type* output_offset_in_row, - const cudf::size_type* num_bytes, - const int8_t** input_data, - const cudf::bitmask_type** input_nm, - int8_t* output_data) -{ - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // We do not support copying a subset of the columns in a row yet, so we don't - // currently support a row that is wider than shared memory. - // For simplicity we will refer to this as a row_group - - // In practice we have found reading more than 4 columns of data per thread - // results in performance loss. As such we are using a 2 dimensional - // kernel in terms of threads, but not in terms of blocks. Columns are - // controlled by the y dimension (there is no y dimension in blocks). Rows - // are controlled by the x dimension (there are multiple blocks in the x - // dimension). - - cudf::size_type rows_per_group = blockDim.x; - cudf::size_type row_group_start = blockIdx.x; - cudf::size_type row_group_stride = gridDim.x; - cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; - - extern __shared__ int8_t shared_data[]; - - // Because we are copying fixed width only data and we stride the rows - // this thread will always start copying to shared data in the same place - int8_t* row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t* row_vld_tmp = - &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; - - for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; - row_group_index += row_group_stride) { - // Within the row group there should be 1 thread for each row. This is a - // requirement for launching the kernel - cudf::size_type row_index = start_row + (row_group_index * rows_per_group) + threadIdx.x; - // But we might not use all of the threads if the number of rows does not go - // evenly into the thread count. We don't want those threads to exit yet - // because we may need them to copy data back out. 
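// --- Illustrative sketch (editor's annotation, not part of this patch) ---
// The pattern the comment above describes: out-of-range threads guard their work with an
// if instead of returning, because every thread in the block still has to reach the
// __syncthreads() and help stage data through shared memory. Minimal CUDA shape of that
// pattern (hypothetical kernel, assumes blockDim.x <= 256):
__global__ void sketch_guarded_copy(int const* in, int* out, int num_rows)
{
  __shared__ int staging[256];
  int const row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < num_rows) { staging[threadIdx.x] = in[row]; }  // guard, do not return early
  __syncthreads();  // every thread of the block must arrive here
  if (row < num_rows) { out[row] = staging[threadIdx.x]; }
}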
- if (row_index < (start_row + num_rows)) { - cudf::size_type col_index_start = threadIdx.y; - cudf::size_type col_index_stride = blockDim.y; - for (cudf::size_type col_index = col_index_start; col_index < num_columns; - col_index += col_index_stride) { - cudf::size_type col_size = num_bytes[col_index]; - int8_t* col_tmp = &(row_tmp[output_offset_in_row[col_index]]); - const int8_t* col_input = input_data[col_index]; - switch (col_size) { - case 1: { - *col_tmp = col_input[row_index]; - break; - } - case 2: { - const int16_t* short_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = short_col_input[row_index]; - break; - } - case 4: { - const int32_t* int_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = int_col_input[row_index]; - break; - } - case 8: { - const int64_t* long_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = long_col_input[row_index]; - break; - } - default: { - cudf::size_type input_offset = col_size * row_index; - // TODO this should just not be supported for fixed width columns, but just in case... - for (cudf::size_type b = 0; b < col_size; b++) { - col_tmp[b] = col_input[b + input_offset]; - } - break; - } - } - // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned - // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t* valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; - uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; - int32_t* valid_int = reinterpret_cast(valid_byte - fixup_bytes); - cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); - // Now copy validity for the column - if (input_nm[col_index]) { - if (bit_is_set(input_nm[col_index], row_index)) { - atomicOr_block(valid_int, 1 << int_bit_offset); - } else { - atomicAnd_block(valid_int, ~(1 << int_bit_offset)); - } - } else { - // It is valid so just set the bit - atomicOr_block(valid_int, 1 << int_bit_offset); - } - } // end column loop - } // end row copy - // wait for the row_group to be totally copied into shared memory - __syncthreads(); - - // Step 2: Copy the data back out - // We know row_size is always aligned with and a multiple of int64_t; - int64_t* long_shared = reinterpret_cast(shared_data); - int64_t* long_output = reinterpret_cast(output_data); - - cudf::size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); - cudf::size_type shared_input_stride = blockDim.x * blockDim.y; - cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); - if (row_index_end > num_rows) { row_index_end = num_rows; } - cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - cudf::size_type shared_length = row_size * num_rows_in_group; - - cudf::size_type shared_input_end = shared_length / sizeof(int64_t); - - cudf::size_type start_output_index = - (row_size * row_group_index * rows_per_group) / sizeof(int64_t); - - for (cudf::size_type shared_index = shared_input_index; shared_index < shared_input_end; - shared_index += shared_input_stride) { - long_output[start_output_index + shared_index] = long_shared[shared_index]; - } - __syncthreads(); - // Go for the next round - } -} - -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - -struct block_info { - int start_col; - int start_row; - int end_col; - int end_row; - int buffer_num; - - __host__ __device__ size_type get_shared_row_size(size_type const* const col_offsets, - size_type const* const col_sizes, - bool 
debug_print = false) const - { - if (debug_print) - printf("col_offsets[%d]: %p + col_sizes[%d]: %p - col_offsets[%d]: %p\n%d + %d - %d\n", - end_col, - &col_offsets[end_col], - end_col, - &col_sizes[end_col], - start_col, - &col_offsets[start_col], - col_offsets[end_col], - col_sizes[end_col], - col_offsets[start_col]); - return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); - } - __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } - - __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } -}; - -// When building the columns to return, we have to be mindful of the offset limit in cudf. -// It is 32-bit and these data columns are capable of surpassing that easily. The data should -// not be cut off exactly at the limit though due to the validity buffers. The most efficient -// place to cut the validity is on a 32-row boundary, so as we calculate the row sizes -// we keep track of the cut points for the validity, which we call row batches. If the row -// is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we -// hit. Note that this boundary is for our book-keeping with column pointers and not anything that -// the kernel needs to worry about. We cut the output at convienient boundaries when assembling -// the outgoing data stream. -struct row_batch { - size_type num_bytes; - size_type row_count; -}; - -/** - * @brief copy data from cudf columns into x format, which is row-based - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param input_data pointer to raw table data - * @param input_nm pointer to validity data - * @param col_sizes array of sizes for each element in a column - one per column - * @param col_offsets offset into input data row for each column's start - * @param block_infos information about the blocks of work - * @param row_offsets offset to a specific row in the input data - * @param output_data pointer to output data - * - */ -__global__ void copy_from_columns(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_block, - const size_type num_block_infos, - const int8_t** input_data, - const size_type* col_sizes, - const size_type* col_offsets, - const block_info* block_infos, - const size_type* row_offsets, - int8_t** output_data) -{ - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // This has been broken up for us in the block_info struct, so we don't have - // any calculation to do here, but it is important to note. 
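// --- Illustrative sketch (editor's annotation, not part of this patch) ---
// A block_info describes a rectangle of the table, and the bytes one of its rows
// occupies in shared memory is the span from the first column's offset to the end of
// the last column, padded to 8 bytes. Standalone sketch of that calculation
// (hypothetical helper mirroring get_shared_row_size above):
inline int sketch_shared_row_size(int const* col_offsets, int const* col_sizes,
                                  int start_col, int end_col)
{
  int const span = col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col];
  return (span + 7) & ~7;  // same as align_offset(span, 8)
}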
- - constexpr bool debug_print = false; // blockIdx.x == 0 && threadIdx.x == 1; - - constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; - auto group = cooperative_groups::this_thread_block(); - extern __shared__ int8_t shared_data[]; - int8_t* shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; - - __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&block_barrier[i], group.size()); - } - } - - group.sync(); - - if (debug_print) { - printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("col sizes at %p, col offsets at %p, and row offsets at %p\n", - col_sizes, - col_offsets, - row_offsets); - printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); - printf("output data to %p\n", output_data[block_infos[blockIdx.x].buffer_num]); - printf("shared memory pointers are %p and %p\n", shared[0], shared[1]); - printf("shared_memory ends at %p\n", &shared_data[shmem_used_per_block * 2]); - printf("group is %d threads\n", group.size()); - } - // else { return; } - - auto const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS, - (uint)NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS); - - size_t fetch; - size_t subset; - for (subset = fetch = 0; subset < blocks_remaining; ++subset) { - // Fetch ahead up to stages_count subsets - for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { - if (debug_print) - printf("fetching block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch); - auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch]; - if (debug_print) - printf("block %lu rows %d-%d and cols %d-%d\n", - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch, - fetch_block.start_row, - fetch_block.end_row, - fetch_block.start_col, - fetch_block.end_col); - - auto const num_fetch_cols = fetch_block.num_cols(); - auto const num_fetch_rows = fetch_block.num_rows(); - auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; - auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); - auto const starting_column_offset = col_offsets[fetch_block.start_col]; - auto& fetch_barrier = block_barrier[fetch % NUM_BLOCKS_PER_KERNEL_LOADED]; - - // wait for the last use of the memory to be completed - if (fetch > NUM_BLOCKS_PER_KERNEL_LOADED) { fetch_barrier.arrive_and_wait(); } - - // to do the copy we need to do n column copies followed by m element copies OR - // we have to do m element copies followed by r row copies. When going from column - // to row it is much easier to copy by elements first otherwise we would need a running - // total of the column sizes for our block, which isn't readily available. This makes it - // more appealing to copy element-wise from input data into shared matching the end layout - // and do row-based memcopies out. 
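// --- Illustrative sketch (editor's annotation, not part of this patch) ---
// "Copy element-wise into shared matching the end layout": each thread takes a flat
// element index within the block, splits it into (column, row), and computes the
// row-major byte offset that element will have in the staged output row. Index math
// only, with hypothetical names (rel_col_offset is the column's offset minus the
// offset of the block's first column):
struct sketch_shared_slot { int rel_col; int rel_row; int shared_offset; };
inline sketch_shared_slot sketch_element_slot(int el, int num_fetch_rows,
                                              int block_row_size, int rel_col_offset)
{
  int const rel_col = el / num_fetch_rows;
  int const rel_row = el % num_fetch_rows;
  return {rel_col, rel_row, rel_row * block_row_size + rel_col_offset};
}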
- - auto const shared_buffer_base = shared[fetch % stages_count]; - for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { - auto const relative_col = el / num_fetch_rows; - auto const relative_row = el % num_fetch_rows; - auto const absolute_col = relative_col + fetch_block.start_col; - auto const absolute_row = relative_row + fetch_block.start_row; - if (debug_print) - printf("row %d(%d), col %d(%d), %d fetch rows, element %d\n", - relative_row, - absolute_row, - relative_col, - absolute_col, - num_fetch_rows, - el); - auto const col_size = col_sizes[absolute_col]; - auto const col_offset = col_offsets[absolute_col]; - auto const relative_col_offset = col_offset - starting_column_offset; - - auto const shared_offset = relative_row * fetch_block_row_size + relative_col_offset; - auto const input_src = input_data[absolute_col] + col_size * absolute_row; - - if (debug_print) - printf("block %lu to shared chunk %lu. %p <- %p(0x%x) - %d bytes\n", - fetch, - fetch % stages_count, - &shared_buffer_base[shared_offset], - input_src, - *input_src, - col_size); - - // copy the element from global memory - switch (col_size) { - case 2: - cuda::memcpy_async(&shared_buffer_base[shared_offset], - input_src, - cuda::aligned_size_t<2>(col_size), - fetch_barrier); - break; - case 4: - cuda::memcpy_async(&shared_buffer_base[shared_offset], - input_src, - cuda::aligned_size_t<4>(col_size), - fetch_barrier); - break; - case 8: - cuda::memcpy_async(&shared_buffer_base[shared_offset], - input_src, - cuda::aligned_size_t<8>(col_size), - fetch_barrier); - break; - default: - cuda::memcpy_async( - &shared_buffer_base[shared_offset], input_src, col_size, fetch_barrier); - break; - } - } - } - - auto& subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; - subset_barrier.arrive_and_wait(); - - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset]; - if (debug_print) - printf("reading block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset); - - auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); - auto const column_offset = col_offsets[block.start_col]; - auto const block_output_buffer = output_data[block.buffer_num]; - - // copy entire rows to final dest - for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; - absolute_row += blockDim.x) { - auto const relative_row = absolute_row - block.start_row; - auto const output_dest = block_output_buffer + row_offsets[absolute_row] + column_offset; - if (debug_print) - printf("processing row %d\noutput data[%d] is address %p\n", - absolute_row, - absolute_row, - output_dest); - auto const shared_offset = block_row_size * relative_row; - if (debug_print) - printf("memcpy %p <- %p - %d bytes which is row %d\n", - output_dest, - &shared[subset % stages_count][shared_offset], - block_row_size, - absolute_row); - - cuda::memcpy_async(output_dest, - &shared[subset % stages_count][shared_offset], - cuda::aligned_size_t<8>(block_row_size), - subset_barrier); - } - } - - // wait on the last copies to complete - for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { - block_barrier[i].arrive_and_wait(); - } -} - -/** - * @brief copy data from row-based format to cudf columns - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets - * @param output_data pointer to 
output data, partitioned by data size - * @param validity_offsets offset into input data row for validity data - * @param block_infos information about the blocks of work - * @param num_block_infos number of infos in blocks array - * @param input_data pointer to input data - * - */ -__global__ void copy_validity_from_columns(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_block, - const size_type* row_offsets, - int8_t** output_data, - const size_type validity_offset, - const block_info* block_infos, - const size_type num_block_infos, - const bitmask_type** input_nm) -{ - extern __shared__ int8_t shared_data[]; - int8_t* shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { - shared_data, shared_data + shmem_used_per_block / 2}; - - constexpr bool print_debug = false; // threadIdx.x==0 && blockIdx.x == 0; - // if (blockIdx.x != 3 || threadIdx.x / 32 != 0) return; - if (print_debug) { - printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("%d %d - block infos are at %p and my index is %d\n", - threadIdx.x, - blockIdx.x, - block_infos, - blockIdx.x); - printf("%d %d - input nm is %p, input_nm[0] is at %p\n", - threadIdx.x, - blockIdx.x, - input_nm, - input_nm[0]); - printf("shared memory is %p to %p\n", shared_data, shared_data + shmem_used_per_block * 2); - printf("block infos at %p and this is index %d\n", - &block_infos, - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + 0); - /* printf("Row Offsets:\n"); - for (int i=0; i - shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&shared_block_barriers[i], group.size()); - } - } - - group.sync(); - - for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { - if (validity_block != validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { - if (print_debug) - printf("%d: waiting at barrier %d\n", - threadIdx.x, - validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED); - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] - .arrive_and_wait(); - if (print_debug) printf("past barrier...\n"); - } - int8_t* this_shared_block = shared_blocks[validity_block % 2]; - if (print_debug) printf("top of loop for validity block %d\n", validity_block); - if (print_debug) - printf("reading validity block info %d at %p\n", - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block, - &block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]); - auto block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; - - auto const num_block_cols = block.num_cols(); - auto const num_block_rows = block.num_rows(); - - auto const num_sections_x = util::div_rounding_up_unsafe(num_block_cols, 32); - auto const num_sections_y = util::div_rounding_up_unsafe(num_block_rows, 32); - auto const validity_data_row_length = - align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); - auto const total_sections = num_sections_x * num_sections_y; - - if (print_debug) { - printf("%d %d - block %d has %d cols, %d rows, %d row length, and %d total sections\n", - threadIdx.x, - blockIdx.x, - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block, - num_block_cols, - num_block_rows, - validity_data_row_length, - total_sections); - } - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = 
std::max(1u, blockDim.x / detail::warp_size); - - if (print_debug) - printf( - "%d %d - my warp is %d, %d total sections(%d x, %d y), %d warps per block, " - "blockDim.x=%d, " - "warp size " - "%d\n", - threadIdx.x, - blockIdx.x, - warp_id, - total_sections, - num_sections_x, - num_sections_y, - warps_per_block, - blockDim.x, - detail::warp_size); - // the block is divided into sections. A warp operates on a section at a time. - for (int my_section_idx = warp_id; my_section_idx < total_sections; - my_section_idx += warps_per_block) { - // convert to rows and cols - auto const section_x = my_section_idx % num_sections_x; - auto const section_y = my_section_idx / num_sections_x; - - if (print_debug) - printf("working on section %d,%d - %d of %d...\n", - section_x, - section_y, - my_section_idx, - total_sections); - auto const relative_col = section_x * 32 + lane_id; - auto const relative_row = section_y * 32; - auto const absolute_col = relative_col + block.start_col; - auto const absolute_row = relative_row + block.start_row; - auto const cols_left = num_columns - absolute_col; - - if (print_debug) printf("pre ballot sync...\n"); - auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); - - if (print_debug) - printf( - "participation mask is 0x%x for relative row %d(%d real), relative col %d(%d " - "absolute)\n", - participation_mask, - relative_row, - absolute_row, - relative_col, - absolute_col); - - if (absolute_col < num_columns) { - if (print_debug) - printf( - "thread %d's byte is at %p, participation mask is 0x%x for relative row %d(%d real), " - "relative col %d(%d absolute)\n", - threadIdx.x, - &input_nm[absolute_col][absolute_row / 32], - participation_mask, - relative_row, - absolute_row, - relative_col, - absolute_col); - auto my_data = input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] - : std::numeric_limits::max(); - - if (print_debug) - printf( - "thread %d's bytes are 0x%x, participation mask is 0x%x for relative row %d(%d real), " - "relative col %d(%d absolute)\n", - threadIdx.x, - my_data, - participation_mask, - relative_row, - absolute_row, - relative_col, - absolute_col); - - // every thread that is participating in the warp has a byte, but it's column-based - // data and we need it in row-based. So we shuffle the bits around with ballot_sync to - // make the bytes we actually write. 
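// --- Illustrative sketch (editor's annotation, not part of this patch) ---
// The bit transpose described above: each lane of the warp holds one column's validity
// word covering the same 32 rows, and a ballot on bit i across the warp produces the
// row-major word for row i (lane k contributes column k's bit). One step of that
// shuffle, stripped of the surrounding bookkeeping (hypothetical device helper, assumes
// all 32 lanes participate):
__device__ inline unsigned sketch_validity_transpose_step(unsigned my_column_word, int row_bit)
{
  unsigned const my_bit = my_column_word & (1u << row_bit);  // (my column, this row) valid?
  return __ballot_sync(0xFFFFFFFFu, my_bit);                 // bit k = lane k's column bit
}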
- bitmask_type dw_mask = 1; - for (int i = 0; i < 32 && relative_row + i < num_rows; ++i, dw_mask <<= 1) { - auto validity_data = __ballot_sync(participation_mask, my_data & dw_mask); - // lead thread in each warp writes data - auto const validity_write_offset = - validity_data_row_length * (relative_row + i) + relative_col / 8; - if (threadIdx.x % detail::warp_size == 0) { - if (print_debug) - printf( - "%d %d - byte_mask is 0x%x, masked_byte is 0x%x, shared_data_block[%d][%d] = " - "0x%x\n", - threadIdx.x, - blockIdx.x, - dw_mask, - my_data & dw_mask, - validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED, - validity_write_offset, - validity_data); - if (cols_left <= 8) { - // write byte - if (print_debug) - printf("%d %d - writing single byte to shared offset 0x%x which is %p...\n", - threadIdx.x, - blockIdx.x, - validity_write_offset, - &this_shared_block[validity_write_offset]); - this_shared_block[validity_write_offset] = validity_data & 0xFF; - } else if (cols_left <= 16) { - // write int16 - if (print_debug) - printf("%d %d - writing two bytes to shared offset 0x%x which is %p...\n", - threadIdx.x, - blockIdx.x, - validity_write_offset, - &this_shared_block[validity_write_offset]); - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - } else if (cols_left <= 24) { - // write int16 and then int8 - if (print_debug) - printf("%d %d - writing three bytes to shared offset 0x%x which is %p...\n", - threadIdx.x, - blockIdx.x, - validity_write_offset, - &this_shared_block[validity_write_offset]); - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; - } else { - // write int32 - if (print_debug) - printf("%d %d - writing 4 bytes to shared offset 0x%x which is %p...\n", - threadIdx.x, - blockIdx.x, - validity_write_offset, - &this_shared_block[validity_write_offset]); - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data; - } - } - } - } - } - - // make sure entire block has finished copy - group.sync(); - - auto const output_data_base = - output_data[block.buffer_num] + validity_offset + block.start_col / 8; - - // now async memcpy the shared memory out to the final destination - for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { - auto const relative_row = row - block.start_row; - if (print_debug) { - printf( - "base output data is %p, row offset is 0x%x, validity offset into row is 0x%x, word " - "index of block start is 0x%x\n", - output_data[block.buffer_num], - row_offsets[row], - validity_offset, - word_index(block.start_col)); - printf( - "%d %d - row %d/%d/%d col %d-%d - %p = shared_data_block[%d][%d] which is %p - %d " - "bytes\n - %p <- 0x%x\n", - threadIdx.x, - blockIdx.x, - block.start_row, - row, - block.end_row, - block.start_col, - block.end_col, - output_data[block.buffer_num] + row_offsets[row] + validity_offset + - (word_index(block.start_col)), - validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED, - validity_data_row_length * relative_row, - &this_shared_block[validity_data_row_length * relative_row], - util::div_rounding_up_unsafe(num_block_cols, 8), - output_data[block.buffer_num] + row_offsets[row] + validity_offset + - word_index(block.start_col), - this_shared_block[validity_data_row_length * relative_row]); - } - auto const output_ptr = output_data_base + row_offsets[row]; - auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); - - 
cuda::memcpy_async( - output_ptr, - &this_shared_block[validity_data_row_length * relative_row], - num_bytes, - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); - } - } - - // wait for last blocks of data to arrive - for (int validity_block = 0; - validity_block < blocks_remaining % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - ++validity_block) { - shared_block_barriers[validity_block].arrive_and_wait(); - } -} - -static __device__ std::tuple get_admin_data_sizes(size_t col_size_size, - size_t col_offset_size, - int const num_cols) -{ - auto const col_size_bytes = num_cols * col_size_size; - auto const col_offset_bytes = num_cols * col_offset_size; - - return {col_size_bytes, col_offset_bytes}; -} - -/** - * @brief ensure `read_ahead` buffer blocks are fetched - * - * @param fetch_index internal state passed into the function - * @param processing_index index where processing is occuring - * @param read_ahead_count how many blocks to read ahead - * @param max_resident_blocks how many blocks can be loaded at once - * @param total_blocks total number of blocks overall - * @param block_infos pointer to the block infos - * @param col_sizes pointer to column size information - * @param col_offsets pointer to the table's column offsets - * @param row_offsets pointer to offsets for each row in the table - * @param input_data pointer to the input data - * @param shared pointer to shared memory - * @param group thread group participating in the fetch - * @param block_barrier barriers used for each block - * @param debug_print - * @return - */ -static __device__ void fetch_blocks_for_row_to_column( - size_t& fetch_index, - size_t const processing_index, - int const read_ahead_count, - int const max_resident_blocks, - int const total_blocks, - block_info const* const block_infos, - size_type const* const col_sizes, - size_type const* const col_offsets, - size_type const* const row_offsets, - int8_t const* const input_data, - int8_t* shared[], - cooperative_groups::thread_block const group, - cuda::barrier* block_barrier, - bool debug_print) -{ - for (; fetch_index < static_cast(total_blocks) && - fetch_index < (processing_index + read_ahead_count); - ++fetch_index) { - auto const fetch_block = - block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index]; - if (debug_print) - printf( - "fetching block %lu of %d for start col %d, end col %d. 
Starting col offset is %p, " - "ending " - "offset %p\n", - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index, - total_blocks, - fetch_block.start_col, - fetch_block.end_col, - &col_offsets[fetch_block.start_col], - &col_offsets[fetch_block.end_col]); - auto const fetch_block_start_row = fetch_block.start_row; - auto const fetch_block_end_row = fetch_block.end_row; - auto const starting_col_offset = col_offsets[fetch_block.start_col]; - - auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); - auto const num_fetch_cols = fetch_block.num_cols(); - auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( - sizeof(decltype(*col_sizes)), sizeof(decltype(*col_offsets)), num_fetch_cols); - auto& fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; - - // if we have fetched all buffers, we need to wait for processing - // to complete on them before we can use them again - if (fetch_index > NUM_BLOCKS_PER_KERNEL_LOADED) { fetch_barrier.arrive_and_wait(); } - - auto shared_row_offset = 0; - // copy the data for column sizes - if (debug_print) - printf("%d: col sizes memcpy_async(group, %p, %p, %d, barrier);\n", - threadIdx.x, - &shared[fetch_index % max_resident_blocks][shared_row_offset], - &col_offsets[fetch_block.start_col], - col_size_bytes); - if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) - printf("%d-%d fetching to %p with barrier %p\n", - threadIdx.x, - blockIdx.x, - shared[fetch_index % max_resident_blocks], - &fetch_barrier); - cuda::memcpy_async(group, - &shared[fetch_index % max_resident_blocks][shared_row_offset], - &col_sizes[fetch_block.start_col], - col_size_bytes, - fetch_barrier); - shared_row_offset += col_size_bytes; - // copy the data for column offsets - if (debug_print) - printf("%d: offsets memcpy_async(group, %p, %p, %d, barrier);\n", - threadIdx.x, - &shared[fetch_index % max_resident_blocks][shared_row_offset], - &col_offsets[fetch_block.start_col], - col_offset_bytes); - cuda::memcpy_async(group, - &shared[fetch_index % max_resident_blocks][shared_row_offset], - &col_offsets[fetch_block.start_col], - col_offset_bytes, - fetch_barrier); - shared_row_offset += col_offset_bytes; - shared_row_offset = align_offset(shared_row_offset, 8); - - for (auto row = fetch_block_start_row + static_cast(threadIdx.x); - row <= fetch_block_end_row; - row += blockDim.x) { - auto shared_offset = (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; - if (debug_print) - printf( - "%d - fetching block %lu to shared chunk %lu. 
%p(shared[%d %% %d][%d]) <- %p(row %d, row " - "offset %d starting col offset %d)\n", - threadIdx.x, - fetch_index, - fetch_index % max_resident_blocks, - &shared[fetch_index % max_resident_blocks][shared_offset], - (int)fetch_index, - max_resident_blocks, - shared_offset, - &input_data[row_offsets[row] + starting_col_offset], - row, - row_offsets[row], - starting_col_offset); - // copy the main - cuda::memcpy_async(&shared[fetch_index % max_resident_blocks][shared_offset], - &input_data[row_offsets[row] + starting_col_offset], - fetch_block_row_size, - fetch_barrier); - } - } -} - -/** - * @brief copy data from row-based format to cudf columns - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block - * @param row_offsets - * @param output_data - * @param output_nm - * @param col_sizes array of sizes for each element in a column - one per column - * @param col_offsets offset into input data row for each column's start - * @param block_infos information about the blocks of work - * @param input_data pointer to input data - * - */ -__global__ void copy_to_columns(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_block, - const size_type* row_offsets, - int8_t** output_data, - const size_type* _col_sizes, - const size_type* _col_offsets, - const block_info* block_infos, - const size_type num_block_infos, - const int8_t* input_data) -{ - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // This has been broken up for us in the block_info struct, so we don't have - // any calculation to do here, but it is important to note. 
- - // to speed up some of the random access memory we do, we copy col_sizes and col_offsets - // to shared memory for each of the blocks that we work on - - constexpr bool debug_print = false; // threadIdx.x == 2 && blockIdx.x == 0; - constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; - auto group = cooperative_groups::this_thread_block(); - extern __shared__ int8_t shared_data[]; - int8_t* shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; - - if (debug_print) { - printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf( - "%d block infos are at %p and my index is %d\n", num_block_infos, block_infos, blockIdx.x); - /* printf("Row Offsets:\n"); - for (int i=0; i block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&block_barrier[i], group.size()); - } - } - - group.sync(); - - auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS, - (uint)NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); - - auto get_admin_data_sizes = [col_size_size = sizeof(decltype(*_col_sizes)), - col_offset_size = sizeof(decltype(*_col_offsets))]( - int const num_cols, - int const num_rows) -> std::tuple { - auto const col_size_bytes = num_cols * col_size_size; - auto const col_offset_bytes = num_cols * col_offset_size; - - return {col_size_bytes, col_offset_bytes}; - }; - - if (debug_print) - printf("%d blocks remaining -> %d block infos, %d block index\n", - blocks_remaining, - num_block_infos, - blockIdx.x); - size_t fetch; - size_t subset; - for (subset = fetch = 0; subset < blocks_remaining; ++subset) { - // Fetch ahead up to stages_count subsets - fetch_blocks_for_row_to_column(fetch, - subset, - stages_count, - stages_count, - blocks_remaining, - block_infos, - _col_sizes, - _col_offsets, - row_offsets, - input_data, - shared, - group, - block_barrier, - debug_print); - - auto& subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; - // ensure our data is ready - if (debug_print) - printf("%d-%d waiting at barrier %p\n", threadIdx.x, blockIdx.x, &subset_barrier); - subset_barrier.arrive_and_wait(); - - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; - if (debug_print) - printf("%d-%d reading block %lu at address %p\n", - threadIdx.x, - blockIdx.x, - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset, - shared[subset % stages_count]); - - auto const rows_in_block = block.num_rows(); - auto const cols_in_block = block.num_cols(); - - auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes(cols_in_block, rows_in_block); - // auto shared_row_offsets = shared[subset]; - auto shared_col_sizes = reinterpret_cast(shared[subset % stages_count]); - auto shared_col_offsets = - reinterpret_cast(&shared[subset % stages_count][col_size_bytes]); - - auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); - - auto block_row_size = block.get_shared_row_size(_col_offsets, _col_sizes, debug_print); - - // now we copy from shared memory to final destination. - // the data is laid out in rows in shared memory, so the reads - // for a column will be "vertical". Because of this and the different - // sizes for each column, this portion is handled on row/column basis. 
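// --- Illustrative sketch (editor's annotation, not part of this patch) ---
// Shared memory for each block is laid out as [column sizes][column offsets][pad to 8]
// followed by one packed row after another, so the source offset of element
// (relative_row, relative_col) is found as below (hypothetical helper that mirrors the
// indexing used in the copy loop that follows):
inline int sketch_shmem_src_offset(int const* shared_col_offsets, int relative_col,
                                   int relative_row, int block_row_size,
                                   int col_size_bytes, int col_offset_bytes)
{
  int const admin_bytes = ((col_size_bytes + col_offset_bytes) + 7) & ~7;  // align to 8
  int const col_in_row  = shared_col_offsets[relative_col] - shared_col_offsets[0];
  return admin_bytes + relative_row * block_row_size + col_in_row;
}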
- // to prevent each thread working on a single row and also to ensure - // that all threads can do work in the case of more threads than rows, - // we do a global index instead of a double for loop with col/row. - for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { - auto const relative_col = index % cols_in_block; - auto const relative_row = index / cols_in_block; - auto const absolute_col = relative_col + block.start_col; - auto const absolute_row = relative_row + block.start_row; - - if (debug_print) - printf("copying for row %d(%d absolute) col %d(%d absolute)\n", - relative_row, - absolute_row, - relative_col, - absolute_col); - - auto const shared_memory_row_offset = block_row_size * relative_row; - if (debug_print) - printf("shared_col_offsets is %p and relative col is %d, making me access %p\n", - shared_col_offsets, - relative_col, - &shared_col_offsets[relative_col]); - auto const shared_memory_offset = shared_col_offsets[relative_col] - shared_col_offsets[0] + - shared_memory_row_offset + shared_row_offset; - if (debug_print) - printf("shared_col_sizes is %p and relative col is %d, making me access %p\n", - shared_col_sizes, - relative_col, - &shared_col_sizes[relative_col]); - auto const column_size = shared_col_sizes[relative_col]; - - int8_t* shmem_src = &shared[subset % stages_count][shared_memory_offset]; - int8_t* dst = &output_data[absolute_col][absolute_row * column_size]; - - if (debug_print) { - printf( - "relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, " - "shared_mmeory_row_offset: %d, shared_memory_offset: %d," - " column_size: %d, shmem_src: %p, dst: %p\n",//, uint32 is %u\n", - relative_col, - relative_row, - absolute_col, - absolute_row, - shared_memory_row_offset, - shared_memory_offset, - column_size, - shmem_src, - dst/*, - *reinterpret_cast(shmem_src)*/); - printf("memcpy_async(%p, %p, %d, subset_barrier);\n", dst, shmem_src, column_size); - } - if (debug_print && absolute_col == 0 && absolute_row == 51) { - printf("col0row51(%d bytes) = %p - 0x", column_size, shmem_src); - for (int i = 0; i < column_size; ++i) { - printf("%x ", shmem_src[i]); - } - printf("\n"); - } - - cuda::memcpy_async(dst, shmem_src, column_size, subset_barrier); - } - group.sync(); - if (debug_print) - printf( - "%d-%d copy to main memory with barrier %p\n", threadIdx.x, blockIdx.x, &subset_barrier); - } - - // wait on the last copies to complete - for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { - block_barrier[i].arrive_and_wait(); - } -} - -/** - * @brief copy data from row-based format to cudf columns - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets - * @param output_nm - * @param validity_offsets offset into input data row for validity data - * @param block_infos information about the blocks of work - * @param num_block_infos number of infos in blocks array - * @param input_data pointer to input data - * - */ -__global__ void copy_validity_to_columns(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_block, - const size_type* row_offsets, - cudf::bitmask_type** output_nm, - const size_type validity_offset, - const block_info* block_infos, - const size_type num_block_infos, - const int8_t* input_data) -{ - extern __shared__ int8_t shared_data[]; - int8_t* 
shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { - shared_data, shared_data + shmem_used_per_block / 2}; - - constexpr bool print_debug = false; // threadIdx.x == 0 && blockIdx.x == 0; - if (print_debug) { - printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("%d %d - block infos are at %p and my index is %d\n", - threadIdx.x, - blockIdx.x, - block_infos, - blockIdx.x); - printf( - "%d %d - Shared memory starts at %p and ends at %p, input data is %p, output data is %p, " - "row " - "offsets are %p, block infos at %p\n", - threadIdx.x, - blockIdx.x, - shared_data, - shared_data + shmem_used_per_block, - input_data, - output_nm, - row_offsets, - block_infos); - } - // else { return; } - - // per conversation with DaveB - // each thread of warp reads a single byte of validity - so we read 32 bytes - // then ballot_sync the bits and write the result to shmem - // after we fill shared mem memcpy it out in a blob. - // probably need knobs for number of rows vs columns to balance read/write - auto group = cooperative_groups::this_thread_block(); - - int const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, - (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); - - if (print_debug) printf("%d blocks with %d in group\n", blocks_remaining, group.size()); - - __shared__ cuda::barrier - shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&shared_block_barriers[i], group.size()); - } - } - - group.sync(); - - for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { - auto const validity_index = validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - if (validity_block != validity_index) { - shared_block_barriers[validity_index].arrive_and_wait(); - } - int8_t* this_shared_block = shared_blocks[validity_block % 2]; - auto const block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; - auto const block_start_col = block.start_col; - auto const block_start_row = block.start_row; - - auto const num_block_cols = block.num_cols(); - auto const num_block_rows = block.num_rows(); - - auto const num_sections_x = (num_block_cols + 7) / 8; - auto const num_sections_y = (num_block_rows + 31) / 32; - auto const validity_data_col_length = num_sections_y * 4; // words to bytes - auto const total_sections = num_sections_x * num_sections_y; - - if (print_debug) { - printf("%d %d - block %d has %d cols, %d rows, and %d total sections\n", - threadIdx.x, - blockIdx.x, - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block, - num_block_cols, - num_block_rows, - total_sections); - } - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); - - if (print_debug) - printf( - "%d %d - my warp is %d, %d total sections, %d warps per block, blockDim.x=%d, warp side " - "%d\n", - threadIdx.x, - blockIdx.x, - warp_id, - total_sections, - warps_per_block, - blockDim.x, - detail::warp_size); - // the block is divided into sections. A warp operates on a section at a time. 
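// --- Illustrative sketch (editor's annotation, not part of this patch) ---
// Section decomposition for the row-to-column validity copy: the block's rectangle is
// tiled into sections of 8 columns by 32 rows, warps walk the sections, and within a
// section each lane owns one row. Index math only (hypothetical names):
struct sketch_section { int relative_col; int relative_row; };
inline sketch_section sketch_validity_section(int section_idx, int num_sections_x, int lane_id)
{
  int const section_x = section_idx % num_sections_x;
  int const section_y = section_idx / num_sections_x;
  return {section_x * 8,              // 8 columns per section, one output byte of bits
          section_y * 32 + lane_id};  // 32 rows per section, one row per warp lane
}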
- for (int my_section_idx = warp_id; my_section_idx < total_sections; - my_section_idx += warps_per_block) { - // convert to rows and cols - auto const section_x = my_section_idx % num_sections_x; - auto const section_y = my_section_idx / num_sections_x; - - auto const relative_col = section_x * 8; - auto const relative_row = section_y * 32 + lane_id; - auto const absolute_col = relative_col + block_start_col; - auto const absolute_row = relative_row + block_start_row; - auto const rows_left = num_rows - absolute_row; - - /* if (print_debug) - printf("%d-%d: si: %d nsx: %d nsy: %d sx: %d sy: %d ar: %d nr: %d rc: %d rr: %d\n", - threadIdx.x, - blockIdx.x, - my_section_idx, - num_sections_x, - num_sections_y, - section_x, - section_y, - absolute_row, - num_rows, - relative_col, - relative_row);*/ - auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_row < num_rows); - - if (absolute_row < num_rows) { - auto const my_byte = - input_data[row_offsets[absolute_row] + validity_offset + absolute_col / 8]; - - // so every thread that is participating in the warp has a byte, but it's row-based - // data and we need it in column-based. So we shiffle the bits around to make - // the bytes we actually write. - for (int i = 0, byte_mask = 1; i < 8 && relative_col + i < num_columns; - ++i, byte_mask <<= 1) { - auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); - // lead thread in each warp writes data - if (threadIdx.x % detail::warp_size == 0) { - auto const validity_write_offset = - validity_data_col_length * (relative_col + i) + relative_row / 8; - - if (print_debug) - printf( - "%d - Writing validity data for column %d, row %d 0x%x to shared memory location " - "%d(%d * (%d + %d) + %d / 8)\n", - threadIdx.x, - absolute_col + i, - absolute_row, - validity_data, - validity_write_offset, - validity_data_col_length, - relative_col, - i, - relative_row); - - if (rows_left <= 8) { - // write byte - this_shared_block[validity_write_offset] = validity_data & 0xFF; - } else if (rows_left <= 16) { - // write int16 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - } else if (rows_left <= 24) { - // write int16 and then int8 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; - } else { - // write int32 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data; - } - } - } - } - } - - // make sure entire block has finished copy - group.sync(); - - // now async memcpy the shared - for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { - auto const relative_col = col - block.start_col; - auto const words_to_copy = util::div_rounding_up_unsafe(num_block_rows, 32); - auto const starting_address = output_nm[col] + word_index(block_start_row); - - if (print_debug) - printf("%d %d - col %d memcpy_async(%p(offset %d), %p, %d, subset_barrier); - 0x%x\n", - threadIdx.x, - blockIdx.x, - col, - starting_address, - word_index(block_start_row), - &this_shared_block[validity_data_col_length * relative_col], - words_to_copy * 4, - this_shared_block[validity_data_col_length * relative_col]); - cuda::memcpy_async( - output_nm[col] + word_index(block_start_row), - &this_shared_block[validity_data_col_length * relative_col], - util::div_rounding_up_unsafe(num_block_rows, 8), - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); - } - } - - // wait for last 
blocks of data to arrive - auto const num_blocks_to_wait = blocks_remaining > NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED - ? NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED - : blocks_remaining; - for (int validity_block = 0; validity_block < num_blocks_to_wait; ++validity_block) { - shared_block_barriers[validity_block].arrive_and_wait(); - } -} - -#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - -/** - * Calculate the dimensions of the kernel for fixed width only columns. - * @param [in] num_columns the number of columns being copied. - * @param [in] num_rows the number of rows being copied. - * @param [in] size_per_row the size each row takes up when padded. - * @param [out] blocks the size of the blocks for the kernel - * @param [out] threads the size of the threads for the kernel - * @return the size in bytes of shared memory needed for each block. - */ -static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, - const cudf::size_type num_rows, - const cudf::size_type size_per_row, - dim3& blocks, - dim3& threads) -{ - // We have found speed degrades when a thread handles more than 4 columns. - // Each block is 2 dimensional. The y dimension indicates the columns. - // We limit this to 32 threads in the y dimension so we can still - // have at least 32 threads in the x dimension (1 warp) which should - // result in better coalescing of memory operations. We also - // want to guarantee that we are processing a multiple of 32 threads - // in the x dimension because we use atomic operations at the block - // level when writing validity data out to main memory, and that would - // need to change if we split a word of validity data between blocks. - int y_block_size = (num_columns + 3) / 4; // cudf::util::div_rounding_up_safe(num_columns, 4); - if (y_block_size > 32) { y_block_size = 32; } - int x_possible_block_size = 1024 / y_block_size; - // 48KB is the default setting for shared memory per block according to the cuda tutorials - // If someone configures the GPU to only have 16 KB this might not work. - int max_shared_size = 48 * 1024; - int max_block_size = max_shared_size / size_per_row; - // If we don't have enough shared memory there is no point in having more threads - // per block that will just sit idle - max_block_size = max_block_size > x_possible_block_size ? x_possible_block_size : max_block_size; - // Make sure that the x dimension is a multiple of 32 this not only helps - // coalesce memory access it also lets us do a ballot sync for validity to write - // the data back out the warp level. If x is a multiple of 32 then each thread in the y - // dimension is associated with one or more warps, that should correspond to the validity - // words directly. - int block_size = (max_block_size / 32) * 32; - CUDF_EXPECTS(block_size != 0, "Row size is too large to fit in shared memory"); - - int num_blocks = (num_rows + block_size - 1) / block_size; - if (num_blocks < 1) { - num_blocks = 1; - } else if (num_blocks > 10240) { - // The maximum number of blocks supported in the x dimension is 2 ^ 31 - 1 - // but in practice haveing too many can cause some overhead that I don't totally - // understand. Playing around with this haveing as little as 600 blocks appears - // to be able to saturate memory on V100, so this is an order of magnitude higher - // to try and future proof this a bit. 
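// --- Illustrative worked example (editor's annotation, not part of this patch) ---
// For a hypothetical table of 10 fixed-width columns with size_per_row = 100 bytes:
//   y_block_size          = (10 + 3) / 4     = 3
//   x_possible_block_size = 1024 / 3         = 341
//   max_block_size        = 48 * 1024 / 100  = 491, then min(491, 341) = 341
//   block_size            = (341 / 32) * 32  = 320 threads in x
//   shared memory needed  = 100 * 320        = 32,000 bytes per block
// and num_blocks = ceil(num_rows / 320), clamped to at most 10240 just below.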
- num_blocks = 10240; - } - blocks.x = num_blocks; - blocks.y = 1; - blocks.z = 1; - threads.x = block_size; - threads.y = y_block_size; - threads.z = 1; - return size_per_row * block_size; -} - -/** - * When converting to rows it is possible that the size of the table was too big to fit - * in a single column. This creates an output column for a subset of the rows in a table - * going from start row and containing the next num_rows. Most of the parameters passed - * into this function are common between runs and should be calculated once. - */ -static std::unique_ptr fixed_width_convert_to_rows( - const cudf::size_type start_row, - const cudf::size_type num_rows, - const cudf::size_type num_columns, - const cudf::size_type size_per_row, - rmm::device_uvector& column_start, - rmm::device_uvector& column_size, - rmm::device_uvector& input_data, - rmm::device_uvector& input_nm, - const cudf::scalar& zero, - const cudf::scalar& scalar_size_per_row, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - int64_t total_allocation = size_per_row * num_rows; - // We made a mistake in the split somehow - CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); - - // Allocate and set the offsets row for the byte array - std::unique_ptr offsets = - cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream); - - std::unique_ptr data = - cudf::make_numeric_column(cudf::data_type(cudf::type_id::INT8), - static_cast(total_allocation), - cudf::mask_state::UNALLOCATED, - stream, - mr); - - dim3 blocks; - dim3 threads; - int shared_size = - detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - - copy_from_fixed_width_columns<<>>( - start_row, - num_rows, - num_columns, - size_per_row, - column_start.data(), - column_size.data(), - input_data.data(), - input_nm.data(), - data->mutable_view().data()); - - return cudf::make_lists_column(num_rows, - std::move(offsets), - std::move(data), - 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, - stream, - mr); -} - -static cudf::data_type get_data_type(const cudf::column_view& v) { return v.type(); } - -static inline bool are_all_fixed_width(std::vector const& schema) -{ - return std::all_of( - schema.begin(), schema.end(), [](const cudf::data_type& t) { return cudf::is_fixed_width(t); }); -} - -/** - * Given a set of fixed width columns, calculate how the data will be laid out in memory. - * @param [in] schema the types of columns that need to be laid out. - * @param [out] column_start the byte offset where each column starts in the row. - * @param [out] column_size the size in bytes of the data for each columns in the row. - * @return the size in bytes each row needs. - */ -static inline int32_t compute_fixed_width_layout(std::vector const& schema, - std::vector& column_start, - std::vector& column_size) -{ - // We guarantee that the start of each column is 64-bit aligned so anything can go - // there, but to make the code simple we will still do an alignment for it. 
- int32_t at_offset = 0; - for (auto col = schema.begin(); col < schema.end(); col++) { - cudf::size_type s = cudf::size_of(*col); - column_size.emplace_back(s); - std::size_t allocation_needed = s; - std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types - at_offset = align_offset(at_offset, alignment_needed); - column_start.emplace_back(at_offset); - at_offset += allocation_needed; - } - - // Now we need to add in space for validity - // Eventually we can think about nullable vs not nullable, but for now we will just always add - // it in - int32_t validity_bytes_needed = - (schema.size() + 7) / 8; // cudf::util::div_rounding_up_safe(schema.size(), 8); - // validity comes at the end and is byte aligned so we can pack more in. - at_offset += validity_bytes_needed; - // Now we need to pad the end so all rows are 64 bit aligned - return align_offset(at_offset, 8); // 8 bytes (64 bits) -} - -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - -template -static size_type compute_column_information(iterator begin, - iterator end, - std::vector& column_starts, - std::vector& column_sizes) //, -// std::function nested_type_cb) -{ - size_type fixed_width_size_per_row = 0; - for (auto cv = begin; cv != end; ++cv) { - auto col_type = std::get<0>(*cv); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - // if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } - - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 8 : size_of(col_type); - - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - } - - auto validity_offset = fixed_width_size_per_row; - column_starts.push_back(validity_offset); - - return fixed_width_size_per_row; -} - -//#define DEBUG - -std::vector build_validity_block_infos( - size_type const& num_columns, - size_type const& num_rows, - size_type const& shmem_limit_per_block, - std::vector const& row_batches) -{ - auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); - auto const column_stride = align_offset( - [&]() { - if (desired_rows_and_columns > num_columns) { - // not many columns, group it into 8s and ship it off - return std::min(8, num_columns); - } else { - return util::round_down_safe(desired_rows_and_columns, 8); - } - }(), - 8); - // we fit as much as we can given the column stride - // note that an element in the table takes just 1 bit, but a row with a single - // element still takes 8 bytes! 
- auto const bytes_per_row = align_offset(util::div_rounding_up_unsafe(column_stride, 8), 8); - auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); - - std::vector validity_block_infos; - for (int col = 0; col < num_columns; col += column_stride) { - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int row = 0; - while (row < num_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(row_stride, rows_left_in_batch); - - validity_block_infos.emplace_back(detail::block_info{ - col, row, std::min(col + column_stride - 1, num_columns - 1), row + window_height - 1}); - row += window_height; - rows_left_in_batch -= window_height; - } - } - - return validity_block_infos; -} - -std::vector build_block_infos(std::vector const& column_sizes, - std::vector const& column_starts, - std::vector const& row_batches, - size_type const total_number_of_rows, - size_type const& shmem_limit_per_block) -{ - std::vector block_infos; - - // block infos are organized with the windows going "down" the columns - // this provides the most coalescing of memory access - int current_window_width = 0; - int current_window_start_col = 0; - - // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( - int const start_col, int const end_col, int const desired_window_height) { - int current_window_start_row = 0; - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; - while (i < total_number_of_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(desired_window_height, rows_left_in_batch); - - block_infos.emplace_back(detail::block_info{ - start_col, - current_window_start_row, - end_col, - std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), - current_window_row_batch}); - - i += window_height; - current_window_start_row += window_height; - rows_left_in_batch -= window_height; - } - }; - - // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write - // would be memory cache line sized access, but since other blocks will read/write the edges - // this may not turn out to be overly important. For now, we will attempt to build a square - // window as far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = - // shared_mem_size since we want them equal, so height and width are sqrt(shared_mem_size). The - // trick is that it's in bytes, not rows or columns. 
- size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); - int const window_height = std::clamp( - util::round_up_safe( - std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], - total_number_of_rows), - 32), - 1, - row_batches[0].row_count); -#if defined(DEBUG) - printf( - "optimal_square_len is %d and we have %d columns, optimal_square_len / column_sizes[0] is %d " - "and num_rows is %d, batch row count is %d " - "- which makes window height " - "%d - admin size is %lu\n", - optimal_square_len, - (int)column_sizes.size(), - optimal_square_len / column_sizes[0], - total_number_of_rows, - row_batches[0].row_count, - window_height, - column_sizes.size() * sizeof(size_type) * 2); -#endif - - auto calc_admin_data_size = [](int num_cols) -> size_type { - // admin data is the column sizes and column start information. - // this is copied to shared memory as well and needs to be accounted for - // in the window calculation. - return num_cols * sizeof(size_type) + num_cols * sizeof(size_type); - }; - - int row_size = 0; - - // march each column and build the blocks of appropriate sizes - for (unsigned int col = 0; col < column_sizes.size(); ++col) { - auto const col_size = column_sizes[col]; - - // align size for this type - std::size_t alignment_needed = col_size; // They are the same for fixed width types - auto row_size_aligned = detail::align_offset(row_size, alignment_needed); - auto row_size_with_this_col = row_size_aligned + col_size; - auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - - if (row_size_with_end_pad * window_height + - calc_admin_data_size(col - current_window_start_col) > - shmem_limit_per_block) { -#if defined(DEBUG) - printf( - "row size with end pad is %d and admin data is %d, which adds up to %d and that is too " - "large for shmem block of %d\n", - row_size_with_end_pad, - calc_admin_data_size(col - current_window_start_col), - row_size_with_end_pad * window_height + - calc_admin_data_size(col - current_window_start_col), - shmem_limit_per_block); - printf( - "Window size %d too large at column %d, admin size is %d, bumping back to build windows " - "of " - "size %d(cols " - "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is " - "%d) " - "for shared mem size %d\n", - row_size_with_end_pad * window_height, - col, - calc_admin_data_size(col - current_window_start_col), - row_size * window_height, - current_window_start_col, - col - 1, - window_height, - row_size_with_end_pad, - row_size, - row_size_aligned, - shmem_limit_per_block); -#endif - // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col == 0 ? 
col : col - 1, window_height); - row_size = - detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); -#if defined(DEBUG) - printf( - "New window starting with offset %d and row size %d to be %d (previous column offset " - "%d+%d " - "or %d)\n", - row_size, - col_size, - row_size + col_size, - column_starts[col - 1], - column_sizes[col - 1], - column_starts[col - 1] + column_sizes[col - 1]); -#endif - row_size += col_size; // alignment required for shared memory window boundary to match + #include + #include + #include + #include + #include + + #include + #include + + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + #include + #endif + + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 8; + constexpr auto NUM_BLOCKS_PER_KERNEL_TO_ROWS = 2; + constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; + constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; + constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; + #endif + + using cudf::detail::make_device_uvector_async; + using rmm::device_uvector; + namespace cudf { + + namespace detail { + + static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size_t alignment) { + return (offset + alignment - 1) & ~(alignment - 1); + } + + __global__ void copy_from_rows_fixed_width_optimized( + const cudf::size_type num_rows, const cudf::size_type num_columns, + const cudf::size_type row_size, const cudf::size_type *input_offset_in_row, + const cudf::size_type *num_bytes, int8_t **output_data, cudf::bitmask_type **output_nm, + const int8_t *input_data) { + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // For simplicity we will refer to this as a row_group + + // In practice we have found writing more than 4 columns of data per thread + // results in performance loss. As such we are using a 2 dimensional + // kernel in terms of threads, but not in terms of blocks. Columns are + // controlled by the y dimension (there is no y dimension in blocks). Rows + // are controlled by the x dimension (there are multiple blocks in the x + // dimension). 
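+   // Illustrative mapping (hypothetical launch, assuming blockDim = {64, 8}):
+   // threadIdx.x picks the row inside the current row_group (64 rows per
+   // group here) and threadIdx.y strides over the columns in steps of
+   // blockDim.y, so thread (x=3, y=2) copies row 3 of the group and columns
+   // 2, 10, 18, ... of the table.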
+ + cudf::size_type rows_per_group = blockDim.x; + cudf::size_type row_group_start = blockIdx.x; + cudf::size_type row_group_stride = gridDim.x; + cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + + extern __shared__ int8_t shared_data[]; + + // Because we are copying fixed width only data and we stride the rows + // this thread will always start copying from shared data in the same place + int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t *row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + + for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; + row_group_index += row_group_stride) { + // Step 1: Copy the data into shared memory + // We know row_size is always aligned with and a multiple of int64_t; + int64_t *long_shared = reinterpret_cast(shared_data); + const int64_t *long_input = reinterpret_cast(input_data); + + cudf::size_type shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); + cudf::size_type shared_output_stride = blockDim.x * blockDim.y; + cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); + if (row_index_end > num_rows) { + row_index_end = num_rows; + } + cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); + cudf::size_type shared_length = row_size * num_rows_in_group; + + cudf::size_type shared_output_end = shared_length / sizeof(int64_t); + + cudf::size_type start_input_index = + (row_size * row_group_index * rows_per_group) / sizeof(int64_t); + + for (cudf::size_type shared_index = shared_output_index; shared_index < shared_output_end; + shared_index += shared_output_stride) { + long_shared[shared_index] = long_input[start_input_index + shared_index]; + } + // Wait for all of the data to be in shared memory + __syncthreads(); + + // Step 2 copy the data back out + + // Within the row group there should be 1 thread for each row. This is a + // requirement for launching the kernel + cudf::size_type row_index = (row_group_index * rows_per_group) + threadIdx.x; + // But we might not use all of the threads if the number of rows does not go + // evenly into the thread count. We don't want those threads to exit yet + // because we may need them to copy data in for the next row group. + uint32_t active_mask = __ballot_sync(0xffffffff, row_index < num_rows); + if (row_index < num_rows) { + cudf::size_type col_index_start = threadIdx.y; + cudf::size_type col_index_stride = blockDim.y; + for (cudf::size_type col_index = col_index_start; col_index < num_columns; + col_index += col_index_stride) { + cudf::size_type col_size = num_bytes[col_index]; + const int8_t *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); + int8_t *col_output = output_data[col_index]; + switch (col_size) { + case 1: { + col_output[row_index] = *col_tmp; + break; + } + case 2: { + int16_t *short_col_output = reinterpret_cast(col_output); + short_col_output[row_index] = *reinterpret_cast(col_tmp); + break; + } + case 4: { + int32_t *int_col_output = reinterpret_cast(col_output); + int_col_output[row_index] = *reinterpret_cast(col_tmp); + break; + } + case 8: { + int64_t *long_col_output = reinterpret_cast(col_output); + long_col_output[row_index] = *reinterpret_cast(col_tmp); + break; + } + default: { + cudf::size_type output_offset = col_size * row_index; + // TODO this should just not be supported for fixed width columns, but just in case... 
+ for (cudf::size_type b = 0; b < col_size; b++) { + col_output[b + output_offset] = col_tmp[b]; + } + break; + } + } + + cudf::bitmask_type *nm = output_nm[col_index]; + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; + int predicate = *valid_byte & (1 << byte_bit_offset); + uint32_t bitmask = __ballot_sync(active_mask, predicate); + if (row_index % 32 == 0) { + nm[word_index(row_index)] = bitmask; + } + } // end column loop + } // end row copy + // wait for the row_group to be totally copied before starting on the next row group + __syncthreads(); + } + } + + __global__ void copy_to_rows_fixed_width_optimized( + const cudf::size_type start_row, const cudf::size_type num_rows, + const cudf::size_type num_columns, const cudf::size_type row_size, + const cudf::size_type *output_offset_in_row, const cudf::size_type *num_bytes, + const int8_t **input_data, const cudf::bitmask_type **input_nm, int8_t *output_data) { + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // We do not support copying a subset of the columns in a row yet, so we don't + // currently support a row that is wider than shared memory. + // For simplicity we will refer to this as a row_group + + // In practice we have found reading more than 4 columns of data per thread + // results in performance loss. As such we are using a 2 dimensional + // kernel in terms of threads, but not in terms of blocks. Columns are + // controlled by the y dimension (there is no y dimension in blocks). Rows + // are controlled by the x dimension (there are multiple blocks in the x + // dimension). + + cudf::size_type rows_per_group = blockDim.x; + cudf::size_type row_group_start = blockIdx.x; + cudf::size_type row_group_stride = gridDim.x; + cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + + extern __shared__ int8_t shared_data[]; + + // Because we are copying fixed width only data and we stride the rows + // this thread will always start copying to shared data in the same place + int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t *row_vld_tmp = + &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + + for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; + row_group_index += row_group_stride) { + // Within the row group there should be 1 thread for each row. This is a + // requirement for launching the kernel + cudf::size_type row_index = start_row + (row_group_index * rows_per_group) + threadIdx.x; + // But we might not use all of the threads if the number of rows does not go + // evenly into the thread count. We don't want those threads to exit yet + // because we may need them to copy data back out. 
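+       // (Every thread must still reach the __syncthreads() below and take
+       // part in the shared-memory flush of step 2, so out-of-range rows
+       // simply skip the per-column copy instead of returning early.)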
+ if (row_index < (start_row + num_rows)) { + cudf::size_type col_index_start = threadIdx.y; + cudf::size_type col_index_stride = blockDim.y; + for (cudf::size_type col_index = col_index_start; col_index < num_columns; + col_index += col_index_stride) { + cudf::size_type col_size = num_bytes[col_index]; + int8_t *col_tmp = &(row_tmp[output_offset_in_row[col_index]]); + const int8_t *col_input = input_data[col_index]; + switch (col_size) { + case 1: { + *col_tmp = col_input[row_index]; + break; + } + case 2: { + const int16_t *short_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = short_col_input[row_index]; + break; + } + case 4: { + const int32_t *int_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = int_col_input[row_index]; + break; + } + case 8: { + const int64_t *long_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = long_col_input[row_index]; + break; + } + default: { + cudf::size_type input_offset = col_size * row_index; + // TODO this should just not be supported for fixed width columns, but just in case... + for (cudf::size_type b = 0; b < col_size; b++) { + col_tmp[b] = col_input[b + input_offset]; + } + break; + } + } + // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned + // so we have to rewrite the addresses to make sure that it is 4 byte aligned + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; + uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; + int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); + cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); + // Now copy validity for the column + if (input_nm[col_index]) { + if (bit_is_set(input_nm[col_index], row_index)) { + atomicOr_block(valid_int, 1 << int_bit_offset); + } else { + atomicAnd_block(valid_int, ~(1 << int_bit_offset)); + } + } else { + // It is valid so just set the bit + atomicOr_block(valid_int, 1 << int_bit_offset); + } + } // end column loop + } // end row copy + // wait for the row_group to be totally copied into shared memory + __syncthreads(); + + // Step 2: Copy the data back out + // We know row_size is always aligned with and a multiple of int64_t; + int64_t *long_shared = reinterpret_cast(shared_data); + int64_t *long_output = reinterpret_cast(output_data); + + cudf::size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); + cudf::size_type shared_input_stride = blockDim.x * blockDim.y; + cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); + if (row_index_end > num_rows) { + row_index_end = num_rows; + } + cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); + cudf::size_type shared_length = row_size * num_rows_in_group; + + cudf::size_type shared_input_end = shared_length / sizeof(int64_t); + + cudf::size_type start_output_index = + (row_size * row_group_index * rows_per_group) / sizeof(int64_t); + + for (cudf::size_type shared_index = shared_input_index; shared_index < shared_input_end; + shared_index += shared_input_stride) { + long_output[start_output_index + shared_index] = long_shared[shared_index]; + } + __syncthreads(); + // Go for the next round + } + } + + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + + struct block_info { + int start_col; + int start_row; + int end_col; + int end_row; + int buffer_num; + + __host__ __device__ size_type get_shared_row_size(size_type const *const col_offsets, + size_type const *const col_sizes) 
const { + return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); + } + __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } + + __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } + }; + + // When building the columns to return, we have to be mindful of the offset limit in cudf. + // It is 32-bit and these data columns are capable of surpassing that easily. The data should + // not be cut off exactly at the limit though due to the validity buffers. The most efficient + // place to cut the validity is on a 32-row boundary, so as we calculate the row sizes + // we keep track of the cut points for the validity, which we call row batches. If the row + // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we + // hit. Note that this boundary is for our book-keeping with column pointers and not anything that + // the kernel needs to worry about. We cut the output at convienient boundaries when assembling + // the outgoing data stream. + struct row_batch { + size_type num_bytes; + size_type row_count; + }; + + /** + * @brief copy data from cudf columns into x format, which is row-based + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param input_data pointer to raw table data + * @param input_nm pointer to validity data + * @param col_sizes array of sizes for each element in a column - one per column + * @param col_offsets offset into input data row for each column's start + * @param block_infos information about the blocks of work + * @param row_offsets offset to a specific row in the input data + * @param output_data pointer to output data + * + */ + __global__ void copy_to_rows(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, const size_type num_block_infos, + const int8_t **input_data, const size_type *col_sizes, + const size_type *col_offsets, const block_info *block_infos, + const size_type *row_offsets, int8_t **output_data) { + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // This has been broken up for us in the block_info struct, so we don't have + // any calculation to do here, but it is important to note. 
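+   // (Sketch of the flow implemented below: each CUDA block owns up to
+   // NUM_BLOCKS_PER_KERNEL_TO_ROWS block_infos and double-buffers them
+   // through shared memory -- while one buffer is being written out to its
+   // destination rows, the next block_info is prefetched into the other
+   // buffer with cuda::memcpy_async, coordinated by per-buffer barriers.)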
+ + constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; + auto group = cooperative_groups::this_thread_block(); + extern __shared__ int8_t shared_data[]; + int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; + + __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&block_barrier[i], group.size()); + } + } + + group.sync(); + + auto const blocks_remaining = + std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS, + (uint)NUM_BLOCKS_PER_KERNEL_TO_ROWS); + + size_t fetch; + size_t subset; + for (subset = fetch = 0; subset < blocks_remaining; ++subset) { + // Fetch ahead up to stages_count subsets + for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { + auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + fetch]; + auto const num_fetch_cols = fetch_block.num_cols(); + auto const num_fetch_rows = fetch_block.num_rows(); + auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; + auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); + auto const starting_column_offset = col_offsets[fetch_block.start_col]; + auto &fetch_barrier = block_barrier[fetch % NUM_BLOCKS_PER_KERNEL_LOADED]; + + // wait for the last use of the memory to be completed + if (fetch > NUM_BLOCKS_PER_KERNEL_LOADED) { + fetch_barrier.arrive_and_wait(); + } + + // to do the copy we need to do n column copies followed by m element copies OR + // we have to do m element copies followed by r row copies. When going from column + // to row it is much easier to copy by elements first otherwise we would need a running + // total of the column sizes for our block, which isn't readily available. This makes it + // more appealing to copy element-wise from input data into shared matching the end layout + // and do row-based memcopies out. 
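+       // Illustrative layout (hypothetical two-column block of {INT32, INT64}):
+       // the shared row size is align_offset(8 + 8 - 0, 8) = 16 bytes, so the
+       // element at (relative_row = r, relative_col = 1) is staged at shared
+       // offset r * 16 + 8 -- already the offset it will occupy in the output
+       // row, which is what makes the row-wise memcpy_async below possible.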
+ + auto const shared_buffer_base = shared[fetch % stages_count]; + for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { + auto const relative_col = el / num_fetch_rows; + auto const relative_row = el % num_fetch_rows; + auto const absolute_col = relative_col + fetch_block.start_col; + auto const absolute_row = relative_row + fetch_block.start_row; + auto const col_size = col_sizes[absolute_col]; + auto const col_offset = col_offsets[absolute_col]; + auto const relative_col_offset = col_offset - starting_column_offset; + + auto const shared_offset = relative_row * fetch_block_row_size + relative_col_offset; + auto const input_src = input_data[absolute_col] + col_size * absolute_row; + + // copy the element from global memory + switch (col_size) { + case 2: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, + cuda::aligned_size_t<2>(col_size), fetch_barrier); + break; + case 4: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, + cuda::aligned_size_t<4>(col_size), fetch_barrier); + break; + case 8: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, + cuda::aligned_size_t<8>(col_size), fetch_barrier); + break; + default: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, col_size, + fetch_barrier); + break; + } + } + } + + auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; + subset_barrier.arrive_and_wait(); + + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + subset]; + auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); + auto const column_offset = col_offsets[block.start_col]; + auto const block_output_buffer = output_data[block.buffer_num]; + + // copy entire rows to final dest + for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; + absolute_row += blockDim.x) { + auto const relative_row = absolute_row - block.start_row; + auto const output_dest = block_output_buffer + row_offsets[absolute_row] + column_offset; + auto const shared_offset = block_row_size * relative_row; + + cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], + cuda::aligned_size_t<8>(block_row_size), subset_barrier); + } + } + + // wait on the last copies to complete + for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { + block_barrier[i].arrive_and_wait(); + } + } + + /** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block + * @param offsets + * @param output_data pointer to output data, partitioned by data size + * @param validity_offsets offset into input data row for validity data + * @param block_infos information about the blocks of work + * @param num_block_infos number of infos in blocks array + * @param input_data pointer to input data + * + */ + __global__ void copy_validity_to_rows( + const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, + const size_type *row_offsets, int8_t **output_data, const size_type validity_offset, + const block_info *block_infos, const size_type num_block_infos, const bitmask_type **input_nm) { + extern __shared__ int8_t shared_data[]; + int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { + shared_data, shared_data + shmem_used_per_block / 2}; + + // per conversation with DaveB + // each 
thread of warp reads a single int32 of validity - so we read 128 bytes + // then ballot_sync the bits and write the result to shmem + // after we fill shared mem memcpy it out in a blob. + // probably need knobs for number of rows vs columns to balance read/write + auto group = cooperative_groups::this_thread_block(); + + int const blocks_remaining = + std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, + (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); + + __shared__ cuda::barrier + shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&shared_block_barriers[i], group.size()); + } + } + + group.sync(); + + for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { + if (validity_block != validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] + .arrive_and_wait(); + } + int8_t *this_shared_block = shared_blocks[validity_block % 2]; + auto block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; + + auto const num_block_cols = block.num_cols(); + auto const num_block_rows = block.num_rows(); + + auto const num_sections_x = util::div_rounding_up_unsafe(num_block_cols, 32); + auto const num_sections_y = util::div_rounding_up_unsafe(num_block_rows, 32); + auto const validity_data_row_length = + align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); + auto const total_sections = num_sections_x * num_sections_y; + + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + + // the block is divided into sections. A warp operates on a section at a time. + for (int my_section_idx = warp_id; my_section_idx < total_sections; + my_section_idx += warps_per_block) { + // convert to rows and cols + auto const section_x = my_section_idx % num_sections_x; + auto const section_y = my_section_idx / num_sections_x; + auto const relative_col = section_x * 32 + lane_id; + auto const relative_row = section_y * 32; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + auto const cols_left = num_columns - absolute_col; + auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); + + if (absolute_col < num_columns) { + auto my_data = input_nm[absolute_col] != nullptr ? + input_nm[absolute_col][absolute_row / 32] : + std::numeric_limits::max(); + + // every thread that is participating in the warp has a byte, but it's column-based + // data and we need it in row-based. So we shuffle the bits around with ballot_sync to + // make the bytes we actually write. 
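+         // Illustrative transpose (hypothetical values): lane k of the warp
+         // holds the 32-bit validity word of column (section_x * 32 + k)
+         // covering this section's rows. On iteration i the ballot below
+         // gathers bit i from every lane, producing one bit per column for
+         // row relative_row + i -- the row-major layout being staged in
+         // shared memory.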
+ bitmask_type dw_mask = 1; + for (int i = 0; i < 32 && relative_row + i < num_rows; ++i, dw_mask <<= 1) { + auto validity_data = __ballot_sync(participation_mask, my_data & dw_mask); + // lead thread in each warp writes data + auto const validity_write_offset = + validity_data_row_length * (relative_row + i) + relative_col / 8; + if (threadIdx.x % detail::warp_size == 0) { + if (cols_left <= 8) { + // write byte + this_shared_block[validity_write_offset] = validity_data & 0xFF; + } else if (cols_left <= 16) { + // write int16 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + } else if (cols_left <= 24) { + // write int16 and then int8 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; + } else { + // write int32 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data; + } + } + } + } + } + + // make sure entire block has finished copy + group.sync(); + + auto const output_data_base = + output_data[block.buffer_num] + validity_offset + block.start_col / 8; + + // now async memcpy the shared memory out to the final destination + for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { + auto const relative_row = row - block.start_row; + auto const output_ptr = output_data_base + row_offsets[row]; + auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); + + cuda::memcpy_async( + output_ptr, &this_shared_block[validity_data_row_length * relative_row], num_bytes, + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); + } + } + + // wait for last blocks of data to arrive + for (int validity_block = 0; + validity_block < blocks_remaining % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + ++validity_block) { + shared_block_barriers[validity_block].arrive_and_wait(); + } + } + + static __device__ std::tuple + get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num_cols) { + auto const col_size_bytes = num_cols * col_size_size; + auto const col_offset_bytes = num_cols * col_offset_size; + + return {col_size_bytes, col_offset_bytes}; + } + + /** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block + * @param row_offsets + * @param output_data + * @param output_nm + * @param col_sizes array of sizes for each element in a column - one per column + * @param col_offsets offset into input data row for each column's start + * @param block_infos information about the blocks of work + * @param input_data pointer to input data + * + */ + __global__ void copy_from_rows(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, const size_type *row_offsets, + int8_t **output_data, const size_type *_col_sizes, + const size_type *_col_offsets, const block_info *block_infos, + const size_type num_block_infos, const int8_t *input_data) { + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. 
+ // This has been broken up for us in the block_info struct, so we don't have + // any calculation to do here, but it is important to note. + + // to speed up some of the random access memory we do, we copy col_sizes and col_offsets + // to shared memory for each of the blocks that we work on + + constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; + auto group = cooperative_groups::this_thread_block(); + extern __shared__ int8_t shared_data[]; + int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; + + __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&block_barrier[i], group.size()); + } + } + + group.sync(); + + auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS, + (uint)NUM_BLOCKS_PER_KERNEL_FROM_ROWS); + + size_t fetch_index; + size_t processing_index; + for (processing_index = fetch_index = 0; processing_index < blocks_remaining; + ++processing_index) { + // Fetch ahead up to stages_count groups + for (; fetch_index < static_cast(blocks_remaining) && + fetch_index < (processing_index + stages_count); + ++fetch_index) { + auto const fetch_block = + block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + fetch_index]; + auto const fetch_block_start_row = fetch_block.start_row; + auto const fetch_block_end_row = fetch_block.end_row; + auto const starting_col_offset = _col_offsets[fetch_block.start_col]; + auto const fetch_block_row_size = fetch_block.get_shared_row_size(_col_offsets, _col_sizes); + auto const num_fetch_cols = fetch_block.num_cols(); + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( + sizeof(decltype(*_col_sizes)), sizeof(decltype(*_col_offsets)), num_fetch_cols); + auto &fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; + + // if we have fetched all buffers, we need to wait for processing + // to complete on them before we can use them again + if (fetch_index > NUM_BLOCKS_PER_KERNEL_LOADED) { + fetch_barrier.arrive_and_wait(); + } + + auto shared_row_offset = 0; + // copy the data for column sizes + cuda::memcpy_async(group, &shared[fetch_index % stages_count][shared_row_offset], + &_col_sizes[fetch_block.start_col], col_size_bytes, fetch_barrier); + shared_row_offset += col_size_bytes; + // copy the data for column offsets + cuda::memcpy_async(group, &shared[fetch_index % stages_count][shared_row_offset], + &_col_offsets[fetch_block.start_col], col_offset_bytes, fetch_barrier); + shared_row_offset += col_offset_bytes; + shared_row_offset = align_offset(shared_row_offset, 8); + + for (auto row = fetch_block_start_row + static_cast(threadIdx.x); + row <= fetch_block_end_row; row += blockDim.x) { + auto shared_offset = + (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; + // copy the main + cuda::memcpy_async(&shared[fetch_index % stages_count][shared_offset], + &input_data[row_offsets[row] + starting_col_offset], + fetch_block_row_size, fetch_barrier); + } + } + + auto &processing_barrier = block_barrier[processing_index % NUM_BLOCKS_PER_KERNEL_LOADED]; + + // ensure our data is ready + processing_barrier.arrive_and_wait(); + + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + processing_index]; + auto const rows_in_block = block.num_rows(); + auto const cols_in_block = block.num_cols(); + + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( + 
sizeof(decltype(*_col_sizes)), sizeof(decltype(*_col_offsets)), cols_in_block); + auto shared_col_sizes = reinterpret_cast(shared[processing_index % stages_count]); + auto shared_col_offsets = + reinterpret_cast(&shared[processing_index % stages_count][col_size_bytes]); + + auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); + + auto block_row_size = block.get_shared_row_size(_col_offsets, _col_sizes); + + // now we copy from shared memory to final destination. + // the data is laid out in rows in shared memory, so the reads + // for a column will be "vertical". Because of this and the different + // sizes for each column, this portion is handled on row/column basis. + // to prevent each thread working on a single row and also to ensure + // that all threads can do work in the case of more threads than rows, + // we do a global index instead of a double for loop with col/row. + for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { + auto const relative_col = index % cols_in_block; + auto const relative_row = index / cols_in_block; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + + auto const shared_memory_row_offset = block_row_size * relative_row; + auto const shared_memory_offset = shared_col_offsets[relative_col] - shared_col_offsets[0] + + shared_memory_row_offset + shared_row_offset; + auto const column_size = shared_col_sizes[relative_col]; + + int8_t *shmem_src = &shared[processing_index % stages_count][shared_memory_offset]; + int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; + + cuda::memcpy_async(dst, shmem_src, column_size, processing_barrier); + } + group.sync(); + } + + // wait on the last copies to complete + for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { + block_barrier[i].arrive_and_wait(); + } + } + + /** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block + * @param offsets + * @param output_nm + * @param validity_offsets offset into input data row for validity data + * @param block_infos information about the blocks of work + * @param num_block_infos number of infos in blocks array + * @param input_data pointer to input data + * + */ + __global__ void copy_validity_from_rows( + const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, + const size_type *row_offsets, cudf::bitmask_type **output_nm, const size_type validity_offset, + const block_info *block_infos, const size_type num_block_infos, const int8_t *input_data) { + extern __shared__ int8_t shared_data[]; + int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { + shared_data, shared_data + shmem_used_per_block / 2}; + + // per conversation with DaveB + // each thread of warp reads a single byte of validity - so we read 32 bytes + // then ballot_sync the bits and write the result to shmem + // after we fill shared mem memcpy it out in a blob. 
+ // probably need knobs for number of rows vs columns to balance read/write + auto group = cooperative_groups::this_thread_block(); + + int const blocks_remaining = + std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, + (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); + + __shared__ cuda::barrier + shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&shared_block_barriers[i], group.size()); + } + } + + group.sync(); + + for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { + auto const validity_index = validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + if (validity_block != validity_index) { + shared_block_barriers[validity_index].arrive_and_wait(); + } + int8_t *this_shared_block = shared_blocks[validity_block % 2]; + auto const block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; + auto const block_start_col = block.start_col; + auto const block_start_row = block.start_row; + auto const num_block_cols = block.num_cols(); + auto const num_block_rows = block.num_rows(); + auto const num_sections_x = (num_block_cols + 7) / 8; + auto const num_sections_y = (num_block_rows + 31) / 32; + auto const validity_data_col_length = num_sections_y * 4; // words to bytes + auto const total_sections = num_sections_x * num_sections_y; + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + + // the block is divided into sections. A warp operates on a section at a time. + for (int my_section_idx = warp_id; my_section_idx < total_sections; + my_section_idx += warps_per_block) { + // convert to rows and cols + auto const section_x = my_section_idx % num_sections_x; + auto const section_y = my_section_idx / num_sections_x; + auto const relative_col = section_x * 8; + auto const relative_row = section_y * 32 + lane_id; + auto const absolute_col = relative_col + block_start_col; + auto const absolute_row = relative_row + block_start_row; + auto const rows_left = num_rows - absolute_row; + + auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_row < num_rows); + + if (absolute_row < num_rows) { + auto const my_byte = + input_data[row_offsets[absolute_row] + validity_offset + absolute_col / 8]; + + // so every thread that is participating in the warp has a byte, but it's row-based + // data and we need it in column-based. So we shiffle the bits around to make + // the bytes we actually write. 
+ for (int i = 0, byte_mask = 1; i < 8 && relative_col + i < num_columns; + ++i, byte_mask <<= 1) { + auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); + // lead thread in each warp writes data + if (threadIdx.x % detail::warp_size == 0) { + auto const validity_write_offset = + validity_data_col_length * (relative_col + i) + relative_row / 8; + + if (rows_left <= 8) { + // write byte + this_shared_block[validity_write_offset] = validity_data & 0xFF; + } else if (rows_left <= 16) { + // write int16 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + } else if (rows_left <= 24) { + // write int16 and then int8 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; + } else { + // write int32 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data; + } + } + } + } + } + + // make sure entire block has finished copy + group.sync(); + + // now async memcpy the shared + for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { + auto const relative_col = col - block.start_col; + + cuda::memcpy_async( + output_nm[col] + word_index(block_start_row), + &this_shared_block[validity_data_col_length * relative_col], + util::div_rounding_up_unsafe(num_block_rows, 8), + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); + } + } + + // wait for last blocks of data to arrive + auto const num_blocks_to_wait = blocks_remaining > NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED ? + NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED : + blocks_remaining; + for (int validity_block = 0; validity_block < num_blocks_to_wait; ++validity_block) { + shared_block_barriers[validity_block].arrive_and_wait(); + } + } + + #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + + /** + * Calculate the dimensions of the kernel for fixed width only columns. + * @param [in] num_columns the number of columns being copied. + * @param [in] num_rows the number of rows being copied. + * @param [in] size_per_row the size each row takes up when padded. + * @param [out] blocks the size of the blocks for the kernel + * @param [out] threads the size of the threads for the kernel + * @return the size in bytes of shared memory needed for each block. + */ + static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, + const cudf::size_type num_rows, + const cudf::size_type size_per_row, dim3 &blocks, + dim3 &threads) { + // We have found speed degrades when a thread handles more than 4 columns. + // Each block is 2 dimensional. The y dimension indicates the columns. + // We limit this to 32 threads in the y dimension so we can still + // have at least 32 threads in the x dimension (1 warp) which should + // result in better coalescing of memory operations. We also + // want to guarantee that we are processing a multiple of 32 threads + // in the x dimension because we use atomic operations at the block + // level when writing validity data out to main memory, and that would + // need to change if we split a word of validity data between blocks. 
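+   // Worked example (hypothetical table): 10 columns gives
+   // y_block_size = (10 + 3) / 4 = 3 and x_possible_block_size = 1024 / 3 = 341.
+   // With a 64-byte padded row, 48 KB of shared memory fits 768 rows, the min
+   // below trims that to 341, and rounding down to a warp multiple yields a
+   // block size of 320 threads in x.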
+ int y_block_size = (num_columns + 3) / 4; // cudf::util::div_rounding_up_safe(num_columns, 4); + if (y_block_size > 32) { + y_block_size = 32; + } + int x_possible_block_size = 1024 / y_block_size; + // 48KB is the default setting for shared memory per block according to the cuda tutorials + // If someone configures the GPU to only have 16 KB this might not work. + int max_shared_size = 48 * 1024; + int max_block_size = max_shared_size / size_per_row; + // If we don't have enough shared memory there is no point in having more threads + // per block that will just sit idle + max_block_size = max_block_size > x_possible_block_size ? x_possible_block_size : max_block_size; + // Make sure that the x dimension is a multiple of 32 this not only helps + // coalesce memory access it also lets us do a ballot sync for validity to write + // the data back out the warp level. If x is a multiple of 32 then each thread in the y + // dimension is associated with one or more warps, that should correspond to the validity + // words directly. + int block_size = (max_block_size / 32) * 32; + CUDF_EXPECTS(block_size != 0, "Row size is too large to fit in shared memory"); + + int num_blocks = (num_rows + block_size - 1) / block_size; + if (num_blocks < 1) { + num_blocks = 1; + } else if (num_blocks > 10240) { + // The maximum number of blocks supported in the x dimension is 2 ^ 31 - 1 + // but in practice haveing too many can cause some overhead that I don't totally + // understand. Playing around with this haveing as little as 600 blocks appears + // to be able to saturate memory on V100, so this is an order of magnitude higher + // to try and future proof this a bit. + num_blocks = 10240; + } + blocks.x = num_blocks; + blocks.y = 1; + blocks.z = 1; + threads.x = block_size; + threads.y = y_block_size; + threads.z = 1; + return size_per_row * block_size; + } + + /** + * When converting to rows it is possible that the size of the table was too big to fit + * in a single column. This creates an output column for a subset of the rows in a table + * going from start row and containing the next num_rows. Most of the parameters passed + * into this function are common between runs and should be calculated once. 
+ */ + static std::unique_ptr + fixed_width_convert_to_rows(const cudf::size_type start_row, const cudf::size_type num_rows, + const cudf::size_type num_columns, const cudf::size_type size_per_row, + rmm::device_uvector &column_start, + rmm::device_uvector &column_size, + rmm::device_uvector &input_data, + rmm::device_uvector &input_nm, + const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { + int64_t total_allocation = size_per_row * num_rows; + // We made a mistake in the split somehow + CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); + + // Allocate and set the offsets row for the byte array + std::unique_ptr offsets = + cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream); + + std::unique_ptr data = cudf::make_numeric_column( + cudf::data_type(cudf::type_id::INT8), static_cast(total_allocation), + cudf::mask_state::UNALLOCATED, stream, mr); + + dim3 blocks; + dim3 threads; + int shared_size = + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + + copy_to_rows_fixed_width_optimized<<>>( + start_row, num_rows, num_columns, size_per_row, column_start.data(), column_size.data(), + input_data.data(), input_nm.data(), data->mutable_view().data()); + + return cudf::make_lists_column(num_rows, std::move(offsets), std::move(data), 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr); + } + + static cudf::data_type get_data_type(const cudf::column_view &v) { + return v.type(); + } + + static inline bool are_all_fixed_width(std::vector const &schema) { + return std::all_of(schema.begin(), schema.end(), + [](const cudf::data_type &t) { return cudf::is_fixed_width(t); }); + } + + /** + * Given a set of fixed width columns, calculate how the data will be laid out in memory. + * @param [in] schema the types of columns that need to be laid out. + * @param [out] column_start the byte offset where each column starts in the row. + * @param [out] column_size the size in bytes of the data for each columns in the row. + * @return the size in bytes each row needs. + */ + static inline int32_t compute_fixed_width_layout(std::vector const &schema, + std::vector &column_start, + std::vector &column_size) { + // We guarantee that the start of each column is 64-bit aligned so anything can go + // there, but to make the code simple we will still do an alignment for it. + int32_t at_offset = 0; + for (auto col = schema.begin(); col < schema.end(); col++) { + cudf::size_type s = cudf::size_of(*col); + column_size.emplace_back(s); + std::size_t allocation_needed = s; + std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types + at_offset = align_offset(at_offset, alignment_needed); + column_start.emplace_back(at_offset); + at_offset += allocation_needed; + } + + // Now we need to add in space for validity + // Eventually we can think about nullable vs not nullable, but for now we will just always add + // it in + int32_t validity_bytes_needed = + (schema.size() + 7) / 8; // cudf::util::div_rounding_up_safe(schema.size(), 8); + // validity comes at the end and is byte aligned so we can pack more in. 
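+   // Illustrative layout (hypothetical schema {INT32, INT8, INT64}): the data
+   // packs at offsets 0, 4 and 8 (the INT64 re-aligns to 8), so at_offset is
+   // 16 here. Three columns need (3 + 7) / 8 = 1 validity byte, giving 17
+   // bytes, which the final alignment below pads to a 24-byte row.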
+ at_offset += validity_bytes_needed; + // Now we need to pad the end so all rows are 64 bit aligned + return align_offset(at_offset, 8); // 8 bytes (64 bits) + } + + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + + template + static size_type compute_column_information(iterator begin, iterator end, + std::vector &column_starts, + std::vector &column_sizes) //, + // std::function nested_type_cb) + { + size_type fixed_width_size_per_row = 0; + for (auto cv = begin; cv != end; ++cv) { + auto col_type = std::get<0>(*cv); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + // if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + } + + auto validity_offset = fixed_width_size_per_row; + column_starts.push_back(validity_offset); + + return fixed_width_size_per_row; + } + + std::vector + build_validity_block_infos(size_type const &num_columns, size_type const &num_rows, + size_type const &shmem_limit_per_block, + std::vector const &row_batches) { + auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); + auto const column_stride = align_offset( + [&]() { + if (desired_rows_and_columns > num_columns) { + // not many columns, group it into 8s and ship it off + return std::min(8, num_columns); + } else { + return util::round_down_safe(desired_rows_and_columns, 8); + } + }(), + 8); + // we fit as much as we can given the column stride + // note that an element in the table takes just 1 bit, but a row with a single + // element still takes 8 bytes! 
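+   // Illustrative sizing (hypothetical numbers, table wider than 156 columns):
+   // with shmem_limit_per_block ~= 24 KB, desired_rows_and_columns is
+   // sqrt(24576) ~= 156, which rounds down to a column_stride of 152. That is
+   // ceil(152 / 8) = 19 validity bytes per row, padded to 24, so row_stride
+   // becomes min(num_rows, 24576 / 24) = min(num_rows, 1024).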
+ auto const bytes_per_row = align_offset(util::div_rounding_up_unsafe(column_stride, 8), 8); + auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); + + std::vector validity_block_infos; + for (int col = 0; col < num_columns; col += column_stride) { + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int row = 0; + while (row < num_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(row_stride, rows_left_in_batch); + + validity_block_infos.emplace_back(detail::block_info{ + col, row, std::min(col + column_stride - 1, num_columns - 1), row + window_height - 1}); + row += window_height; + rows_left_in_batch -= window_height; + } + } + + return validity_block_infos; + } + + std::vector build_block_infos(std::vector const &column_sizes, + std::vector const &column_starts, + std::vector const &row_batches, + size_type const total_number_of_rows, + size_type const &shmem_limit_per_block) { + std::vector block_infos; + + // block infos are organized with the windows going "down" the columns + // this provides the most coalescing of memory access + int current_window_width = 0; + int current_window_start_col = 0; + + // build the blocks for a specific set of columns + auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( + int const start_col, int const end_col, int const desired_window_height) { + int current_window_start_row = 0; + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; + while (i < total_number_of_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(desired_window_height, rows_left_in_batch); + + block_infos.emplace_back(detail::block_info{ + start_col, current_window_start_row, end_col, + std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), + current_window_row_batch}); + + i += window_height; + current_window_start_row += window_height; + rows_left_in_batch -= window_height; + } + }; + + // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write + // would be memory cache line sized access, but since other blocks will read/write the edges + // this may not turn out to be overly important. For now, we will attempt to build a square + // window as far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = + // shared_mem_size since we want them equal, so height and width are sqrt(shared_mem_size). The + // trick is that it's in bytes, not rows or columns. + size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); + int const window_height = std::clamp( + util::round_up_safe( + std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], + total_number_of_rows), + 32), + 1, row_batches[0].row_count); + + auto calc_admin_data_size = [](int num_cols) -> size_type { + // admin data is the column sizes and column start information. + // this is copied to shared memory as well and needs to be accounted for + // in the window calculation. 
+ return num_cols * sizeof(size_type) + num_cols * sizeof(size_type); + }; + + int row_size = 0; + + // march each column and build the blocks of appropriate sizes + for (unsigned int col = 0; col < column_sizes.size(); ++col) { + auto const col_size = column_sizes[col]; + + // align size for this type + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_aligned = detail::align_offset(row_size, alignment_needed); + auto row_size_with_this_col = row_size_aligned + col_size; + auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); + + if (row_size_with_end_pad * window_height + + calc_admin_data_size(col - current_window_start_col) > + shmem_limit_per_block) { + // too large, close this window, generate vertical blocks and restart + build_blocks(current_window_start_col, col == 0 ? col : col - 1, window_height); + row_size = + detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); + row_size += col_size; // alignment required for shared memory window boundary to match // alignment of output row - current_window_start_col = col; - current_window_width = 0; - } else { - row_size = row_size_with_this_col; - current_window_width++; - } - } - - // build last set of blocks - if (current_window_width > 0) { - build_blocks(current_window_start_col, (int)column_sizes.size() - 1, window_height); - } - - return block_infos; -} - -#if defined(DEBUG) -void pretty_print(uint64_t i) -{ - if (i > (1 * 1024 * 1024 * 1024)) { - printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); - } else if (i > (1 * 1024 * 1024)) { - printf("%.2f MB", i / float(1 * 1024 * 1024)); - } else if (i > (1 * 1024)) { - printf("%.2f KB", float(i / 1024)); - } else { - printf("%lu Bytes", i); - } -} -#endif -#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - -} // namespace detail - -std::vector> convert_to_rows(cudf::table_view const& tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the - // data, but small enough that multiple columns fit in memory so the writes can coalese as well. - // Potential optimization for window sizes. - const size_type num_columns = tbl.num_columns(); - const size_type num_rows = tbl.num_rows(); - - int device_id; - CUDA_TRY(cudaGetDevice(&device_id)); - int total_shmem; - CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - -#if defined(DEBUG) || 1 - total_shmem -= 1024; -#endif - int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - -#if defined(DEBUG) - size_t free, total; - cudaMemGetInfo(&free, &total); - printf("%lu/%lu Memory\n", free, total); -#endif - - // break up the work into blocks, which are a starting and ending row/col #. - // this window size is calculated based on the shared memory size available - // we want a single block to fill up the entire shared memory space available - // for the transpose-like conversion. - - // There are two different processes going on here. The GPU conversion of the data - // and the writing of the data into the list of byte columns that are a maximum of - // 2 gigs each due to offset maximum size. The GPU conversion portion has to understand - // this limitation because the column must own the data inside and as a result it must be - // a distinct allocation for that column. 
Copying the data into these final buffers would - // be prohibitively expensive, so care is taken to ensure the GPU writes to the proper buffer. - // The windows are broken at the boundaries of specific rows based on the row sizes up - // to that point. These are row batches and they are decided first before building the - // windows so the windows can be properly cut around them. - - // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; - input_data.reserve(num_columns); - input_nm.reserve(num_columns); - for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); - auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (!nested_type) { - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - } - - auto dev_input_data = make_device_uvector_async(input_data, stream, mr); - auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); - - std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row - std::vector column_sizes; // byte size of each column - std::vector column_starts; // offset of column inside a row including alignment - std::vector - variable_width_columns; // list of the variable width columns in the table - row_sizes.reserve(num_rows); - row_offsets.reserve(num_rows); - column_sizes.reserve(num_columns); - column_starts.reserve(num_columns + 1); // we add a final offset for validity data start - - auto iter = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [&tbl](auto i) -> std::tuple { - return std::make_tuple(tbl.column(i).type(), tbl.column(i)); - }); - - size_type fixed_width_size_per_row = detail::compute_column_information(iter, - iter + num_columns, - column_starts, - column_sizes); //, - // [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); - /* size_type fixed_width_size_per_row = 0; - for (int col = 0; col < num_columns; ++col) { - auto cv = tbl.column(col); - auto col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (nested_type) { variable_width_columns.push_back(cv); } - - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 8 : size_of(col_type); - - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - }*/ - -#if defined(DEBUG) - printf("validity offset will be %d + %d = %d\n", - column_starts.back(), - column_sizes.back(), - column_starts.back() + column_sizes.back()); -#endif - - auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); - auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); - - std::vector row_batches; - - auto calculate_variable_width_row_data_size = [](int const row) { - // each level of variable-width data will add an offset/length - // uint64 of data. The first of which is inside the fixed-width - // data itself and needs to be aligned based on what is around - // that data. 
This is handled above with the fixed-width calculations - // for that reason. We may still need to add more of these offset/length - // combinations if the nesting is deeper than one level as these - // will be included in the variable-width data blob at the end of the - // row. - return 0; - /* auto c = variable_width_columns[col]; - while (true) { - auto col_offsets = c.child(0).data(); - auto col_data_size = size_of(c.child(1).type()); - std::size_t alignment_needed = col_data_size; - - row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; - if (c.num_children() == 0) { - break; - } - c = c.child(1); - } - exclusive_scan([t](int row_index) { - size_type total_row_size = 0; - for (int i=0 i - (uint64_t)std::numeric_limits::max()) { - // a new batch starts at the last 32-row boundary - row_batches.push_back( - detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); - row_batch_size = 0; - row_batch_rows = row_batch_rows & 31; - row_offset = 0; - aligned_row_batch_size = 0; - } - row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned - row_offsets.push_back(row_offset); - row_batch_size = aligned_row_batch_size + row_sizes[row]; - row_offset += row_sizes[row]; - total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned - total_table_size += row_sizes[row]; - row_batch_rows++; - } - if (row_batch_size > 0) { - row_batches.push_back( - detail::row_batch{static_cast(row_batch_size), row_batch_rows}); - } - - auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); - -#if defined(DEBUG) - printf("%d rows and %d columns in table\n", num_rows, num_columns); - printf("%lu batches:\n", row_batches.size()); - for (auto i = 0; i < (int)row_batches.size(); ++i) { - printf("%d: %d rows, ", i, row_batches[i].row_count); - detail::pretty_print(row_batches[i].num_bytes); - printf("\n"); - } -#endif - - std::vector output_buffers; - std::vector output_data; - output_data.reserve(row_batches.size()); - for (uint i = 0; i < row_batches.size(); ++i) { - rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); - output_data.push_back(static_cast(temp.data())); - output_buffers.push_back(std::move(temp)); - } - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - -#if defined(DEBUG) - printf("%lu windows for %d columns, %d rows to fit in ", - block_infos.size(), - block_infos[0].end_col - block_infos[0].start_col + 1, - block_infos[0].end_row - block_infos[0].start_row); - detail::pretty_print(shmem_limit_per_block); - printf(" shared mem("); - detail::pretty_print(fixed_width_size_per_row); - printf("/row, %d columns, %d rows, ", num_columns, num_rows); - detail::pretty_print(total_table_size); - printf(" total):\n"); -#endif - - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - - // blast through the entire table and convert it - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS)); - dim3 threads(256); - -#if defined(DEBUG) - printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); - detail::pretty_print(shmem_limit_per_block); - printf(" shared memory\n"); -#endif - detail::copy_from_columns<<>>( - num_rows, - num_columns, - shmem_limit_per_block, - block_infos.size(), - dev_input_data.data(), - dev_col_sizes.data(), - dev_col_starts.data(), - 
dev_block_infos.data(), - dev_row_offsets.data(), - reinterpret_cast(dev_output_data.data())); - - auto validity_block_infos = - build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); - - auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); - dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); - dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); -#if defined(DEBUG) - printf("Launching validity kernel with %d blocks, for %lu validity blocks with %d threads, ", - validity_blocks.x, - validity_block_infos.size(), - validity_threads.x); - detail::pretty_print(total_shmem); - printf(" shared memory\n"); -#endif - detail:: - copy_validity_from_columns<<>>( - num_rows, - num_columns, - shmem_limit_per_block, - dev_row_offsets.data(), - dev_output_data.data(), - column_starts.back(), - dev_validity_block_infos.data(), - validity_block_infos.size(), - dev_input_nm.data()); - - // split up the output buffer into multiple buffers based on row batch sizes - // and create list of byte columns - int offset_offset = 0; - std::vector> ret; - for (uint i = 0; i < row_batches.size(); ++i) { - // compute offsets for this row batch - std::vector offset_vals; - offset_vals.reserve(row_batches[i].row_count + 1); - size_type cur_offset = 0; - offset_vals.push_back(cur_offset); - for (int row = 0; row < row_batches[i].row_count; ++row) { - cur_offset = detail::align_offset(cur_offset, 8) + row_sizes[row + offset_offset]; - offset_vals.push_back(cur_offset); - } - offset_offset += row_batches[i].row_count; - - auto dev_offsets = make_device_uvector_async(offset_vals, stream, mr); - auto offsets = std::make_unique( - data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); - - auto data = std::make_unique( - data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, std::move(output_buffers[i])); - - ret.push_back(cudf::make_lists_column(row_batches[i].row_count, - std::move(offsets), - std::move(data), - 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, - stream, - mr)); - } - - return ret; -#else - CUDF_FAIL("Column to row conversion optimization requires volta or later hardware."); - return {}; -#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 -} - -std::vector> convert_to_rows_fixed_width_optimized( - cudf::table_view const& tbl, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) -{ - const cudf::size_type num_columns = tbl.num_columns(); - - std::vector schema; - schema.resize(num_columns); - std::transform(tbl.begin(), tbl.end(), schema.begin(), detail::get_data_type); - - if (detail::are_all_fixed_width(schema)) { - std::vector column_start; - std::vector column_size; - - int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); - auto dev_column_start = make_device_uvector_async(column_start, stream, mr); - auto dev_column_size = make_device_uvector_async(column_size, stream, mr); - - int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; - // Make the number of rows per batch a multiple of 32 so we don't have to worry about - // splitting validity at a specific row offset. This might change in the future. 
- max_rows_per_batch = (max_rows_per_batch / 32) * 32; - - cudf::size_type num_rows = tbl.num_rows(); - - // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; - for (cudf::size_type column_number = 0; column_number < num_columns; column_number++) { - cudf::column_view cv = tbl.column(column_number); - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - auto dev_input_data = make_device_uvector_async(input_data, stream, mr); - auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); - - using ScalarType = cudf::scalar_type_t; - auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - zero->set_valid_async(true, stream); - static_cast(zero.get())->set_value(0, stream); - - auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - step->set_valid_async(true, stream); - static_cast(step.get()) - ->set_value(static_cast(size_per_row), stream); - - std::vector> ret; - for (cudf::size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { - cudf::size_type row_count = num_rows - row_start; - row_count = row_count > max_rows_per_batch ? max_rows_per_batch : row_count; - ret.emplace_back(detail::fixed_width_convert_to_rows(row_start, - row_count, - num_columns, - size_per_row, - dev_column_start, - dev_column_size, - dev_input_data, - dev_input_nm, - *zero, - *step, - stream, - mr)); - } - - return ret; - } else { - CUDF_FAIL("Only fixed width types are currently supported"); - } -} - -std::unique_ptr convert_from_rows(cudf::lists_column_view const& input, - std::vector const& schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - // verify that the types are what we expect - cudf::column_view child = input.child(); - cudf::type_id list_type = child.type().id(); - CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, - "Only a list of bytes is supported as input"); - - cudf::size_type num_columns = schema.size(); - cudf::size_type num_rows = input.parent().size(); - - int device_id; - CUDA_TRY(cudaGetDevice(&device_id)); - int total_shmem; - CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - -#if defined(DEBUG) || 1 - total_shmem -= 1024; -#endif - int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - - std::vector column_starts; - std::vector column_sizes; - - auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { - return std::make_tuple(schema[i], nullptr); - }); - size_type fixed_width_size_per_row = detail::compute_column_information( - iter, iter + num_columns, column_starts, column_sizes); //, [](void *) {}); - - size_type validity_size = num_bitmask_words(num_columns) * 4; - - size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); - - // Ideally we would check that the offsets are all the same, etc. 
but for now - // this is probably fine - CUDF_EXPECTS(row_size * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); - auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); - - // build the row_batches from the passed in list column - std::vector row_batches; - - row_batches.push_back(detail::row_batch{child.size(), num_rows}); - - // Allocate the columns we are going to write into - std::vector> output_columns; - std::vector output_data; - std::vector output_nm; - for (cudf::size_type i = 0; i < num_columns; i++) { - auto column = cudf::make_fixed_width_column( - schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); - auto mut = column->mutable_view(); - output_data.emplace_back(mut.data()); - output_nm.emplace_back(mut.null_mask()); - output_columns.emplace_back(std::move(column)); - } - - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); - - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); -#if defined(DEBUG) - dim3 threads(std::min(std::min(128, shmem_limit_per_block / 8), (int)child.size())); -#else - dim3 threads(std::min(256, (int)child.size())); -#endif -#if defined(DEBUG) - printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); - detail::pretty_print(total_shmem); - printf(" shared memory\n"); -#endif - detail::copy_to_columns<<>>( - num_rows, - num_columns, - shmem_limit_per_block, - input.offsets().data(), - dev_output_data.data(), - dev_col_sizes.data(), - dev_col_starts.data(), - dev_block_infos.data(), - block_infos.size(), - child.data()); - - auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); - auto const column_stride = [&]() { - if (desired_rows_and_columns > num_columns) { - // not many columns, group it into 64s and ship it off - return std::min(64, num_columns); - } else { - return util::round_down_safe(desired_rows_and_columns, 8); - } - }(); - auto const row_stride = [&]() { - // we fit as much as we can, we know the column stride now, so calculate the row - return std::min(num_rows, util::round_down_safe(shmem_limit_per_block * 8 / column_stride, 32)); - /* if (desired_rows_and_columns > num_rows) { - return std::min(32, num_rows); - } else { - return util::round_down_safe(desired_rows_and_columns, 32); - }*/ - }(); - std::vector validity_block_infos; - for (int col = 0; col < num_columns; col += column_stride) { - for (int row = 0; row < num_rows; row += row_stride) { - validity_block_infos.emplace_back( - detail::block_info{col, - row, - std::min(col + column_stride - 1, num_columns - 1), - std::min(row + row_stride - 1, num_rows - 1)}); - } - } - auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); - dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); -#if defined(DEBUG) - printf( - "Launching validity kernel with %d blocks, for %lu validity blocks, col stride %d and row " - "stride of %d with %d threads, ", - validity_blocks.x, - validity_block_infos.size(), - column_stride, - row_stride, - threads.x); - detail::pretty_print(total_shmem); - 
printf(" shared memory\n"); -#endif - - dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); - detail:: - copy_validity_to_columns<<>>( - num_rows, - num_columns, - shmem_limit_per_block, - input.offsets().data(), - dev_output_nm.data(), - column_starts.back(), - dev_validity_block_infos.data(), - validity_block_infos.size(), - child.data()); - - return std::make_unique(std::move(output_columns)); -#else - CUDF_FAIL("Row to column conversion optimization requires volta or later hardware."); - return {}; -#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 -} - -std::unique_ptr convert_from_rows_fixed_width_optimized( - cudf::lists_column_view const& input, - std::vector const& schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // verify that the types are what we expect - cudf::column_view child = input.child(); - cudf::type_id list_type = child.type().id(); - CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, - "Only a list of bytes is supported as input"); - - cudf::size_type num_columns = schema.size(); - - if (detail::are_all_fixed_width(schema)) { - std::vector column_start; - std::vector column_size; - - cudf::size_type num_rows = input.parent().size(); - int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); - - // Ideally we would check that the offsets are all the same, etc. but for now - // this is probably fine - CUDF_EXPECTS(size_per_row * num_rows == child.size(), - "The layout of the data appears to be off"); - auto dev_column_start = make_device_uvector_async(column_start, stream); - auto dev_column_size = make_device_uvector_async(column_size, stream); - - // Allocate the columns we are going to write into - std::vector> output_columns; - std::vector output_data; - std::vector output_nm; - for (cudf::size_type i = 0; i < num_columns; i++) { - auto column = cudf::make_fixed_width_column( - schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); - auto mut = column->mutable_view(); - output_data.emplace_back(mut.data()); - output_nm.emplace_back(mut.null_mask()); - output_columns.emplace_back(std::move(column)); - } - - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); - - dim3 blocks; - dim3 threads; - int shared_size = - detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - - // printf("Launching (%d, %d, %d) blocks, (%d, %d, %d) threads, with %d shared size\n", - // blocks.x, blocks.y, blocks.z, threads.x, threads.y, threads.z, shared_size); - // printf("pointers are column_start: %p, column_size: %p, output_data: %p, output_nm: %p\n", - // dev_column_start.data(), dev_column_size.data(), dev_output_data.data(), - // dev_output_nm.data()); - detail::copy_to_fixed_width_columns<<>>( - num_rows, - num_columns, - size_per_row, - dev_column_start.data(), - dev_column_size.data(), - dev_output_data.data(), - dev_output_nm.data(), - child.data()); - - return std::make_unique(std::move(output_columns)); - } else { - CUDF_FAIL("Only fixed width types are currently supported"); - } -} - -} // namespace cudf + current_window_start_col = col; + current_window_width = 0; + } else { + row_size = row_size_with_this_col; + current_window_width++; + } + } + + // build last set of blocks + if (current_window_width > 0) { + build_blocks(current_window_start_col, (int)column_sizes.size() - 1, window_height); + } 
+
+  return block_infos;
+}
+
+#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+
+} // namespace detail
+
+std::vector<std::unique_ptr<cudf::column>> convert_to_rows(cudf::table_view const &tbl,
+                                                           rmm::cuda_stream_view stream,
+                                                           rmm::mr::device_memory_resource *mr) {
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+  // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the
+  // data, but small enough that multiple columns fit in memory so the writes can coalesce as well.
+  // Potential optimization for window sizes.
+  const size_type num_columns = tbl.num_columns();
+  const size_type num_rows = tbl.num_rows();
+
+  int device_id;
+  CUDA_TRY(cudaGetDevice(&device_id));
+  int total_shmem;
+  CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id));
+
+  // TODO: why?
+  total_shmem -= 1024;
+  int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED;
+
+  // break up the work into blocks, which are a starting and ending row/col #.
+  // this window size is calculated based on the shared memory size available
+  // we want a single block to fill up the entire shared memory space available
+  // for the transpose-like conversion.
+
+  // There are two different processes going on here. The GPU conversion of the data
+  // and the writing of the data into the list of byte columns that are a maximum of
+  // 2 gigs each due to offset maximum size. The GPU conversion portion has to understand
+  // this limitation because the column must own the data inside and as a result it must be
+  // a distinct allocation for that column. Copying the data into these final buffers would
+  // be prohibitively expensive, so care is taken to ensure the GPU writes to the proper buffer.
+  // The windows are broken at the boundaries of specific rows based on the row sizes up
+  // to that point. These are row batches and they are decided first before building the
+  // windows so the windows can be properly cut around them.
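The row-batch idea described above can be sketched in isolation: keep appending 8-byte-aligned row sizes until the running total would overflow a 32-bit list offset, then close the batch on a 32-row boundary. The sketch below is illustrative only; it uses a constant, made-up row size, whereas the code that follows uses the per-row sizes it computes.

#include <cstdint>
#include <cstdio>
#include <limits>
#include <vector>

// Each output lists column uses 32-bit offsets, so a batch of rows must stay under
// std::numeric_limits<int32_t>::max() bytes, and batches are cut on 32-row boundaries.
struct batch {
  int64_t num_bytes;
  int row_count;
};

int main() {
  int const num_rows = 50000;
  int64_t const row_size = 64 * 1024; // pretend every (already 8-byte aligned) row costs 64 KiB

  std::vector<batch> batches;
  int64_t batch_bytes = 0;
  int batch_rows = 0;
  for (int row = 0; row < num_rows; ++row) {
    if (batch_bytes + row_size > std::numeric_limits<int32_t>::max()) {
      // close the current batch at the last 32-row boundary and carry the remainder over
      int const keep = batch_rows & ~31;
      batches.push_back({static_cast<int64_t>(keep) * row_size, keep});
      batch_rows -= keep;
      batch_bytes = static_cast<int64_t>(batch_rows) * row_size;
    }
    batch_bytes += row_size;
    batch_rows++;
  }
  if (batch_rows > 0) { batches.push_back({batch_bytes, batch_rows}); }

  for (auto const &b : batches) {
    printf("batch: %d rows, %lld bytes\n", b.row_count, static_cast<long long>(b.num_bytes));
  }
  return 0;
}

With these numbers the 50,000 rows split into a 32,736-row batch and a 17,264-row batch, each well under the 2 GB offset limit.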
+
+  // Get the pointers to the input columnar data ready
+  std::vector<const int8_t *> input_data;
+  std::vector<const bitmask_type *> input_nm;
+  input_data.reserve(num_columns);
+  input_nm.reserve(num_columns);
+  for (size_type column_number = 0; column_number < num_columns; column_number++) {
+    column_view cv = tbl.column(column_number);
+    auto const col_type = cv.type();
+    bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING;
+
+    if (!nested_type) {
+      input_data.emplace_back(cv.data<int8_t>());
+      input_nm.emplace_back(cv.null_mask());
+    }
+  }
+
+  auto dev_input_data = make_device_uvector_async(input_data, stream, mr);
+  auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr);
+
+  std::vector<size_type> row_sizes;     // size of each row in bytes including any alignment padding
+  std::vector<size_type> row_offsets;   // offset from the start of the data to this row
+  std::vector<size_type> column_sizes;  // byte size of each column
+  std::vector<size_type> column_starts; // offset of column inside a row including alignment
+  std::vector<column_view>
+      variable_width_columns; // list of the variable width columns in the table
+  row_sizes.resize(num_rows); // written by index below, so it must be sized, not just reserved
+  row_offsets.reserve(num_rows);
+  column_sizes.reserve(num_columns);
+  column_starts.reserve(num_columns + 1); // we add a final offset for validity data start
+
+  auto iter =
+      thrust::make_transform_iterator(thrust::make_counting_iterator(0),
+                                      [&tbl](auto i) -> std::tuple<data_type, column_view const> {
+                                        return std::make_tuple(tbl.column(i).type(), tbl.column(i));
+                                      });
+
+  size_type fixed_width_size_per_row =
+      detail::compute_column_information(iter, iter + num_columns, column_starts, column_sizes);
+
+  auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr);
+  auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr);
+
+  std::vector<detail::row_batch> row_batches;
+
+  uint64_t row_batch_size = 0;
+  uint64_t total_table_size = 0;
+  size_type row_batch_rows = 0;
+  uint64_t row_offset = 0;
+
+  // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then
+  // calculate the size of each row's variable-width data and validity as well.
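Before the per-row sizing loop that follows, the formula it applies can be spelled out on made-up numbers. The inline expression below stands in for cudf's num_bitmask_words, assuming 32-bit bitmask words; the fixed-width size would come from compute_column_information in the real code.

#include <cstdio>

int main() {
  // Made-up values: 26 columns whose fixed-width data occupies 112 bytes per row.
  int const num_columns = 26;
  int const fixed_width_size_per_row = 112;

  // One validity bit per column, rounded up to whole 32-bit bitmask words (4 bytes each).
  int const validity_size = ((num_columns + 31) / 32) * 4; // 4 bytes here

  int row_size = fixed_width_size_per_row + validity_size; // 116
  row_size = (row_size + 7) & ~7;                          // rows are 8-byte aligned -> 120

  printf("per-row size = %d bytes\n", row_size);
  return 0;
}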
+ auto validity_size = num_bitmask_words(num_columns) * 4; + // thrust + for (int row = 0; row < num_rows; ++row) { + auto aligned_row_batch_size = + detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned + row_sizes[row] = fixed_width_size_per_row; + // validity is byte aligned + row_sizes[row] += validity_size; + // variable width data is 8-byte aligned + row_sizes[row] = detail::align_offset(row_sizes[row], 8); // rows are 8 byte aligned + + if ((uint64_t)aligned_row_batch_size + row_sizes[row] > + (uint64_t)std::numeric_limits::max()) { + // a new batch starts at the last 32-row boundary + row_batches.push_back( + detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batch_size = 0; + row_batch_rows = row_batch_rows & 31; + row_offset = 0; + aligned_row_batch_size = 0; + } + row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned + row_offsets.push_back(row_offset); + row_batch_size = aligned_row_batch_size + row_sizes[row]; + row_offset += row_sizes[row]; + total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned + total_table_size += row_sizes[row]; + row_batch_rows++; + } + if (row_batch_size > 0) { + row_batches.push_back( + detail::row_batch{static_cast(row_batch_size), row_batch_rows}); + } + + auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); + + std::vector output_buffers; + std::vector output_data; + output_data.reserve(row_batches.size()); + for (uint i = 0; i < row_batches.size(); ++i) { + rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); + output_data.push_back(static_cast(temp.data())); + output_buffers.push_back(std::move(temp)); + } + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); + + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); + + // blast through the entire table and convert it + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); + dim3 threads(256); + + detail::copy_to_rows<<>>( + num_rows, num_columns, shmem_limit_per_block, block_infos.size(), dev_input_data.data(), + dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), dev_row_offsets.data(), + reinterpret_cast(dev_output_data.data())); + + auto validity_block_infos = + build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); + + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); + dim3 validity_blocks( + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); + dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); + detail::copy_validity_to_rows<<>>( + num_rows, num_columns, shmem_limit_per_block, dev_row_offsets.data(), dev_output_data.data(), + column_starts.back(), dev_validity_block_infos.data(), validity_block_infos.size(), + dev_input_nm.data()); + + // split up the output buffer into multiple buffers based on row batch sizes + // and create list of byte columns + int offset_offset = 0; + std::vector> ret; + for (uint i = 0; i < row_batches.size(); ++i) { + // compute offsets for this row batch + std::vector offset_vals; + offset_vals.reserve(row_batches[i].row_count + 1); + size_type cur_offset = 0; + offset_vals.push_back(cur_offset); + for (int row = 0; row < row_batches[i].row_count; ++row) { + cur_offset = 
detail::align_offset(cur_offset, 8) + row_sizes[row + offset_offset]; + offset_vals.push_back(cur_offset); + } + offset_offset += row_batches[i].row_count; + + auto dev_offsets = make_device_uvector_async(offset_vals, stream, mr); + auto offsets = std::make_unique(data_type{type_id::INT32}, + (size_type)offset_vals.size(), dev_offsets.release()); + + auto data = std::make_unique(data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, + std::move(output_buffers[i])); + + ret.push_back( + cudf::make_lists_column(row_batches[i].row_count, std::move(offsets), std::move(data), 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr)); + } + + return ret; + #else + CUDF_FAIL("Column to row conversion optimization requires volta or later hardware."); + return {}; + #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + } + + std::vector> + convert_to_rows_fixed_width_optimized(cudf::table_view const &tbl, rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { + const cudf::size_type num_columns = tbl.num_columns(); + + std::vector schema; + schema.resize(num_columns); + std::transform(tbl.begin(), tbl.end(), schema.begin(), detail::get_data_type); + + if (detail::are_all_fixed_width(schema)) { + std::vector column_start; + std::vector column_size; + + int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); + auto dev_column_start = make_device_uvector_async(column_start, stream, mr); + auto dev_column_size = make_device_uvector_async(column_size, stream, mr); + + int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; + // Make the number of rows per batch a multiple of 32 so we don't have to worry about + // splitting validity at a specific row offset. This might change in the future. + max_rows_per_batch = (max_rows_per_batch / 32) * 32; + + cudf::size_type num_rows = tbl.num_rows(); + + // Get the pointers to the input columnar data ready + std::vector input_data; + std::vector input_nm; + for (cudf::size_type column_number = 0; column_number < num_columns; column_number++) { + cudf::column_view cv = tbl.column(column_number); + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + auto dev_input_data = make_device_uvector_async(input_data, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); + + using ScalarType = cudf::scalar_type_t; + auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); + zero->set_valid_async(true, stream); + static_cast(zero.get())->set_value(0, stream); + + auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); + step->set_valid_async(true, stream); + static_cast(step.get()) + ->set_value(static_cast(size_per_row), stream); + + std::vector> ret; + for (cudf::size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { + cudf::size_type row_count = num_rows - row_start; + row_count = row_count > max_rows_per_batch ? 
max_rows_per_batch : row_count; + ret.emplace_back(detail::fixed_width_convert_to_rows( + row_start, row_count, num_columns, size_per_row, dev_column_start, dev_column_size, + dev_input_data, dev_input_nm, *zero, *step, stream, mr)); + } + + return ret; + } else { + CUDF_FAIL("Only fixed width types are currently supported"); + } + } + + std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + // verify that the types are what we expect + cudf::column_view child = input.child(); + cudf::type_id list_type = child.type().id(); + CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, + "Only a list of bytes is supported as input"); + + cudf::size_type num_columns = schema.size(); + cudf::size_type num_rows = input.parent().size(); + + int device_id; + CUDA_TRY(cudaGetDevice(&device_id)); + int total_shmem; + CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + + // TODO why? + total_shmem -= 1024; + int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + + std::vector column_starts; + std::vector column_sizes; + + auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { + return std::make_tuple(schema[i], nullptr); + }); + size_type fixed_width_size_per_row = detail::compute_column_information( + iter, iter + num_columns, column_starts, column_sizes); //, [](void *) {}); + + size_type validity_size = num_bitmask_words(num_columns) * 4; + + size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); + + // Ideally we would check that the offsets are all the same, etc. 
but for now + // this is probably fine + CUDF_EXPECTS(row_size * num_rows == child.size(), "The layout of the data appears to be off"); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); + + // build the row_batches from the passed in list column + std::vector row_batches; + + row_batches.push_back(detail::row_batch{child.size(), num_rows}); + + // Allocate the columns we are going to write into + std::vector> output_columns; + std::vector output_data; + std::vector output_nm; + for (cudf::size_type i = 0; i < num_columns; i++) { + auto column = cudf::make_fixed_width_column(schema[i], num_rows, + cudf::mask_state::UNINITIALIZED, stream, mr); + auto mut = column->mutable_view(); + output_data.emplace_back(mut.data()); + output_nm.emplace_back(mut.null_mask()); + output_columns.emplace_back(std::move(column)); + } + + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); + + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); + + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); + + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); + dim3 threads(std::min(std::min(256, shmem_limit_per_block / 8), (int)child.size())); + detail::copy_from_rows<<>>( + num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), + dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), + block_infos.size(), child.data()); + + auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); + auto const column_stride = [&]() { + if (desired_rows_and_columns > num_columns) { + // not many columns, group it into 64s and ship it off + return std::min(64, num_columns); + } else { + return util::round_down_safe(desired_rows_and_columns, 8); + } + }(); + auto const row_stride = [&]() { + // we fit as much as we can, we know the column stride now, so calculate the row + return std::min(num_rows, util::round_down_safe(shmem_limit_per_block * 8 / column_stride, 32)); + /* if (desired_rows_and_columns > num_rows) { + return std::min(32, num_rows); + } else { + return util::round_down_safe(desired_rows_and_columns, 32); + }*/ + }(); + std::vector validity_block_infos; + for (int col = 0; col < num_columns; col += column_stride) { + for (int row = 0; row < num_rows; row += row_stride) { + validity_block_infos.emplace_back( + detail::block_info{col, row, std::min(col + column_stride - 1, num_columns - 1), + std::min(row + row_stride - 1, num_rows - 1)}); + } + } + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); + dim3 validity_blocks( + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); + + dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); + detail:: + copy_validity_from_rows<<>>( + num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), + dev_output_nm.data(), column_starts.back(), dev_validity_block_infos.data(), + validity_block_infos.size(), child.data()); + + return std::make_unique(std::move(output_columns)); + #else + CUDF_FAIL("Row to column conversion optimization requires volta or later hardware."); + return {}; + #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + } + + std::unique_ptr 
convert_from_rows_fixed_width_optimized( + cudf::lists_column_view const &input, std::vector const &schema, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { + // verify that the types are what we expect + cudf::column_view child = input.child(); + cudf::type_id list_type = child.type().id(); + CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, + "Only a list of bytes is supported as input"); + + cudf::size_type num_columns = schema.size(); + + if (detail::are_all_fixed_width(schema)) { + std::vector column_start; + std::vector column_size; + + cudf::size_type num_rows = input.parent().size(); + int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); + + // Ideally we would check that the offsets are all the same, etc. but for now + // this is probably fine + CUDF_EXPECTS(size_per_row * num_rows == child.size(), + "The layout of the data appears to be off"); + auto dev_column_start = make_device_uvector_async(column_start, stream); + auto dev_column_size = make_device_uvector_async(column_size, stream); + + // Allocate the columns we are going to write into + std::vector> output_columns; + std::vector output_data; + std::vector output_nm; + for (cudf::size_type i = 0; i < num_columns; i++) { + auto column = cudf::make_fixed_width_column(schema[i], num_rows, + cudf::mask_state::UNINITIALIZED, stream, mr); + auto mut = column->mutable_view(); + output_data.emplace_back(mut.data()); + output_nm.emplace_back(mut.null_mask()); + output_columns.emplace_back(std::move(column)); + } + + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); + + dim3 blocks; + dim3 threads; + int shared_size = + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + + detail::copy_from_rows_fixed_width_optimized<<>>( + num_rows, num_columns, size_per_row, dev_column_start.data(), dev_column_size.data(), + dev_output_data.data(), dev_output_nm.data(), child.data()); + + return std::make_unique(std::move(output_columns)); + } else { + CUDF_FAIL("Only fixed width types are currently supported"); + } + } + + } // namespace cudf + \ No newline at end of file diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index a67589fbaec..932afa4bb70 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -50,8 +50,8 @@ #include #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 -constexpr auto NUM_BLOCKS_PER_KERNEL_TO_COLUMNS = 8; -constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 2; +constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_TO_ROWS = 2; constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; @@ -67,13 +67,11 @@ static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size return (offset + alignment - 1) & ~(alignment - 1); } -__global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, - const cudf::size_type num_columns, - const cudf::size_type row_size, - const cudf::size_type *input_offset_in_row, - const cudf::size_type *num_bytes, int8_t **output_data, - cudf::bitmask_type **output_nm, - const int8_t *input_data) { +__global__ void copy_from_rows_fixed_width_optimized( + const cudf::size_type num_rows, const 
cudf::size_type num_columns, + const cudf::size_type row_size, const cudf::size_type *input_offset_in_row, + const cudf::size_type *num_bytes, int8_t **output_data, cudf::bitmask_type **output_nm, + const int8_t *input_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -190,12 +188,11 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, } } -__global__ void -copy_from_fixed_width_columns(const cudf::size_type start_row, const cudf::size_type num_rows, - const cudf::size_type num_columns, const cudf::size_type row_size, - const cudf::size_type *output_offset_in_row, - const cudf::size_type *num_bytes, const int8_t **input_data, - const cudf::bitmask_type **input_nm, int8_t *output_data) { +__global__ void copy_to_rows_fixed_width_optimized( + const cudf::size_type start_row, const cudf::size_type num_rows, + const cudf::size_type num_columns, const cudf::size_type row_size, + const cudf::size_type *output_offset_in_row, const cudf::size_type *num_bytes, + const int8_t **input_data, const cudf::bitmask_type **input_nm, int8_t *output_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -367,12 +364,11 @@ struct row_batch { * @param output_data pointer to output data * */ -__global__ void copy_from_columns(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_block, - const size_type num_block_infos, const int8_t **input_data, - const size_type *col_sizes, const size_type *col_offsets, - const block_info *block_infos, const size_type *row_offsets, - int8_t **output_data) { +__global__ void copy_to_rows(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, const size_type num_block_infos, + const int8_t **input_data, const size_type *col_sizes, + const size_type *col_offsets, const block_info *block_infos, + const size_type *row_offsets, int8_t **output_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. 
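The two-pass structure these kernel comments describe, stripped of all row/column bookkeeping, looks like the minimal CUDA sketch below. It is illustrative only: it stages a tile in shared memory and writes it back out unchanged, whereas the real kernels reshape data between row-major and column-major layouts and overlap the passes with cuda::memcpy_async and barriers.

#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

// Pass 1 stages a tile of the input in shared memory; pass 2 writes it back out.
__global__ void two_pass_copy(const int8_t *in, int8_t *out, int tile_bytes) {
  extern __shared__ int8_t tile[];
  int const base = blockIdx.x * tile_bytes;

  // pass 1: global memory -> shared memory
  for (int i = threadIdx.x; i < tile_bytes; i += blockDim.x) { tile[i] = in[base + i]; }
  __syncthreads();

  // pass 2: shared memory -> global memory
  for (int i = threadIdx.x; i < tile_bytes; i += blockDim.x) { out[base + i] = tile[i]; }
}

int main() {
  constexpr int tile_bytes = 1024;
  constexpr int num_tiles = 8;
  constexpr int total = tile_bytes * num_tiles;

  int8_t *d_in, *d_out;
  cudaMalloc(&d_in, total);
  cudaMalloc(&d_out, total);
  cudaMemset(d_in, 7, total);

  two_pass_copy<<<num_tiles, 256, tile_bytes>>>(d_in, d_out, tile_bytes);
  cudaDeviceSynchronize();

  int8_t host[4];
  cudaMemcpy(host, d_out, sizeof(host), cudaMemcpyDeviceToHost);
  printf("%d %d %d %d\n", host[0], host[1], host[2], host[3]); // prints 7 7 7 7

  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}

The point of the staging pass is that both the global reads and the global writes can stay coalesced even when the layout transform between them is not.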
@@ -396,15 +392,15 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ group.sync(); auto const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS, - (uint)NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS); + std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS, + (uint)NUM_BLOCKS_PER_KERNEL_TO_ROWS); size_t fetch; size_t subset; for (subset = fetch = 0; subset < blocks_remaining; ++subset) { // Fetch ahead up to stages_count subsets for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { - auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch]; + auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + fetch]; auto const num_fetch_cols = fetch_block.num_cols(); auto const num_fetch_rows = fetch_block.num_rows(); auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; @@ -462,7 +458,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; subset_barrier.arrive_and_wait(); - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset]; + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + subset]; auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; auto const block_output_buffer = output_data[block.buffer_num]; @@ -499,7 +495,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ * @param input_data pointer to input data * */ -__global__ void copy_validity_from_columns( +__global__ void copy_validity_to_rows( const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, const size_type *row_offsets, int8_t **output_data, const size_type validity_offset, const block_info *block_infos, const size_type num_block_infos, const bitmask_type **input_nm) { @@ -633,74 +629,6 @@ get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num return {col_size_bytes, col_offset_bytes}; } -/** - * @brief ensure `read_ahead` buffer blocks are fetched - * - * @param fetch_index internal state passed into the function - * @param processing_index index where processing is occuring - * @param read_ahead_count how many blocks to read ahead - * @param max_resident_blocks how many blocks can be loaded at once - * @param total_blocks total number of blocks overall - * @param block_infos pointer to the block infos - * @param col_sizes pointer to column size information - * @param col_offsets pointer to the table's column offsets - * @param row_offsets pointer to offsets for each row in the table - * @param input_data pointer to the input data - * @param shared pointer to shared memory - * @param group thread group participating in the fetch - * @param block_barrier barriers used for each block - * @return - */ -static __device__ void -fetch_blocks_for_row_to_column(size_t &fetch_index, size_t const processing_index, - int const read_ahead_count, int const max_resident_blocks, - int const total_blocks, block_info const *const block_infos, - size_type const *const col_sizes, size_type const *const col_offsets, - size_type const *const row_offsets, int8_t const *const input_data, - int8_t *shared[], cooperative_groups::thread_block const group, - cuda::barrier *block_barrier) { - for (; fetch_index < static_cast(total_blocks) && - 
fetch_index < (processing_index + read_ahead_count); - ++fetch_index) { - auto const fetch_block = - block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index]; - auto const fetch_block_start_row = fetch_block.start_row; - auto const fetch_block_end_row = fetch_block.end_row; - auto const starting_col_offset = col_offsets[fetch_block.start_col]; - auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); - auto const num_fetch_cols = fetch_block.num_cols(); - auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( - sizeof(decltype(*col_sizes)), sizeof(decltype(*col_offsets)), num_fetch_cols); - auto &fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; - - // if we have fetched all buffers, we need to wait for processing - // to complete on them before we can use them again - if (fetch_index > NUM_BLOCKS_PER_KERNEL_LOADED) { - fetch_barrier.arrive_and_wait(); - } - - auto shared_row_offset = 0; - // copy the data for column sizes - cuda::memcpy_async(group, &shared[fetch_index % max_resident_blocks][shared_row_offset], - &col_sizes[fetch_block.start_col], col_size_bytes, fetch_barrier); - shared_row_offset += col_size_bytes; - // copy the data for column offsets - cuda::memcpy_async(group, &shared[fetch_index % max_resident_blocks][shared_row_offset], - &col_offsets[fetch_block.start_col], col_offset_bytes, fetch_barrier); - shared_row_offset += col_offset_bytes; - shared_row_offset = align_offset(shared_row_offset, 8); - - for (auto row = fetch_block_start_row + static_cast(threadIdx.x); - row <= fetch_block_end_row; row += blockDim.x) { - auto shared_offset = (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; - // copy the main - cuda::memcpy_async(&shared[fetch_index % max_resident_blocks][shared_offset], - &input_data[row_offsets[row] + starting_col_offset], fetch_block_row_size, - fetch_barrier); - } - } -} - /** * @brief copy data from row-based format to cudf columns * @@ -716,7 +644,7 @@ fetch_blocks_for_row_to_column(size_t &fetch_index, size_t const processing_inde * @param input_data pointer to input data * */ -__global__ void copy_to_columns(const size_type num_rows, const size_type num_columns, +__global__ void copy_from_rows(const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, const size_type *row_offsets, int8_t **output_data, const size_type *_col_sizes, const size_type *_col_offsets, const block_info *block_infos, @@ -746,40 +674,70 @@ __global__ void copy_to_columns(const size_type num_rows, const size_type num_co group.sync(); - auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS, - (uint)NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); + auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS, + (uint)NUM_BLOCKS_PER_KERNEL_FROM_ROWS); + + size_t fetch_index; + size_t processing_index; + for (processing_index = fetch_index = 0; processing_index < blocks_remaining; + ++processing_index) { + // Fetch ahead up to stages_count groups + for (; fetch_index < static_cast(blocks_remaining) && + fetch_index < (processing_index + stages_count); + ++fetch_index) { + auto const fetch_block = + block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + fetch_index]; + auto const fetch_block_start_row = fetch_block.start_row; + auto const fetch_block_end_row = fetch_block.end_row; + auto const starting_col_offset = _col_offsets[fetch_block.start_col]; + auto const 
fetch_block_row_size = fetch_block.get_shared_row_size(_col_offsets, _col_sizes); + auto const num_fetch_cols = fetch_block.num_cols(); + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( + sizeof(decltype(*_col_sizes)), sizeof(decltype(*_col_offsets)), num_fetch_cols); + auto &fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; - auto get_admin_data_sizes = [col_size_size = sizeof(decltype(*_col_sizes)), - col_offset_size = sizeof(decltype(*_col_offsets))]( - int const num_cols, - int const num_rows) -> std::tuple { - auto const col_size_bytes = num_cols * col_size_size; - auto const col_offset_bytes = num_cols * col_offset_size; + // if we have fetched all buffers, we need to wait for processing + // to complete on them before we can use them again + if (fetch_index > NUM_BLOCKS_PER_KERNEL_LOADED) { + fetch_barrier.arrive_and_wait(); + } - return {col_size_bytes, col_offset_bytes}; - }; + auto shared_row_offset = 0; + // copy the data for column sizes + cuda::memcpy_async(group, &shared[fetch_index % stages_count][shared_row_offset], + &_col_sizes[fetch_block.start_col], col_size_bytes, fetch_barrier); + shared_row_offset += col_size_bytes; + // copy the data for column offsets + cuda::memcpy_async(group, &shared[fetch_index % stages_count][shared_row_offset], + &_col_offsets[fetch_block.start_col], col_offset_bytes, fetch_barrier); + shared_row_offset += col_offset_bytes; + shared_row_offset = align_offset(shared_row_offset, 8); + + for (auto row = fetch_block_start_row + static_cast(threadIdx.x); + row <= fetch_block_end_row; row += blockDim.x) { + auto shared_offset = + (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; + // copy the main + cuda::memcpy_async(&shared[fetch_index % stages_count][shared_offset], + &input_data[row_offsets[row] + starting_col_offset], + fetch_block_row_size, fetch_barrier); + } + } - size_t fetch; - size_t subset; - for (subset = fetch = 0; subset < blocks_remaining; ++subset) { - // Fetch ahead up to stages_count subsets - fetch_blocks_for_row_to_column(fetch, subset, stages_count, stages_count, blocks_remaining, - block_infos, _col_sizes, _col_offsets, row_offsets, input_data, - shared, group, block_barrier); + auto &processing_barrier = block_barrier[processing_index % NUM_BLOCKS_PER_KERNEL_LOADED]; - auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; // ensure our data is ready - subset_barrier.arrive_and_wait(); + processing_barrier.arrive_and_wait(); - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + processing_index]; auto const rows_in_block = block.num_rows(); auto const cols_in_block = block.num_cols(); - auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes(cols_in_block, rows_in_block); - // auto shared_row_offsets = shared[subset]; - auto shared_col_sizes = reinterpret_cast(shared[subset % stages_count]); + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( + sizeof(decltype(*_col_sizes)), sizeof(decltype(*_col_offsets)), cols_in_block); + auto shared_col_sizes = reinterpret_cast(shared[processing_index % stages_count]); auto shared_col_offsets = - reinterpret_cast(&shared[subset % stages_count][col_size_bytes]); + reinterpret_cast(&shared[processing_index % stages_count][col_size_bytes]); auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); @@ -803,10 +761,10 @@ __global__ void copy_to_columns(const 
size_type num_rows, const size_type num_co shared_memory_row_offset + shared_row_offset; auto const column_size = shared_col_sizes[relative_col]; - int8_t *shmem_src = &shared[subset % stages_count][shared_memory_offset]; + int8_t *shmem_src = &shared[processing_index % stages_count][shared_memory_offset]; int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; - cuda::memcpy_async(dst, shmem_src, column_size, subset_barrier); + cuda::memcpy_async(dst, shmem_src, column_size, processing_barrier); } group.sync(); } @@ -831,7 +789,7 @@ __global__ void copy_to_columns(const size_type num_rows, const size_type num_co * @param input_data pointer to input data * */ -__global__ void copy_validity_to_columns( +__global__ void copy_validity_from_rows( const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, const size_type *row_offsets, cudf::bitmask_type **output_nm, const size_type validity_offset, const block_info *block_infos, const size_type num_block_infos, const int8_t *input_data) { @@ -1050,7 +1008,7 @@ fixed_width_convert_to_rows(const cudf::size_type start_row, const cudf::size_ty int shared_size = detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - copy_from_fixed_width_columns<<>>( + copy_to_rows_fixed_width_optimized<<>>( start_row, num_rows, num_columns, size_per_row, column_start.data(), column_size.data(), input_data.data(), input_nm.data(), data->mutable_view().data()); @@ -1354,18 +1312,6 @@ std::vector> convert_to_rows(cudf::table_view cons std::vector row_batches; - auto calculate_variable_width_row_data_size = [](int const row) { - // each level of variable-width data will add an offset/length - // uint64 of data. The first of which is inside the fixed-width - // data itself and needs to be aligned based on what is around - // that data. This is handled above with the fixed-width calculations - // for that reason. We may still need to add more of these offset/length - // combinations if the nesting is deeper than one level as these - // will be included in the variable-width data blob at the end of the - // row. 
- return 0; - }; - uint64_t row_batch_size = 0; uint64_t total_table_size = 0; size_type row_batch_rows = 0; @@ -1382,8 +1328,7 @@ std::vector> convert_to_rows(cudf::table_view cons // validity is byte aligned row_sizes[row] += validity_size; // variable width data is 8-byte aligned - row_sizes[row] = detail::align_offset(row_sizes[row], 8) + - calculate_variable_width_row_data_size(row); // rows are 8 byte aligned + row_sizes[row] = detail::align_offset(row_sizes[row], 8); // rows are 8 byte aligned if ((uint64_t)aligned_row_batch_size + row_sizes[row] > (uint64_t)std::numeric_limits::max()) { @@ -1426,10 +1371,10 @@ std::vector> convert_to_rows(cudf::table_view cons auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); // blast through the entire table and convert it - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS)); + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); dim3 threads(256); - detail::copy_from_columns<<>>( + detail::copy_to_rows<<>>( num_rows, num_columns, shmem_limit_per_block, block_infos.size(), dev_input_data.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), dev_row_offsets.data(), reinterpret_cast(dev_output_data.data())); @@ -1439,9 +1384,9 @@ std::vector> convert_to_rows(cudf::table_view cons auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); - detail::copy_validity_from_columns<<>>( num_rows, num_columns, shmem_limit_per_block, dev_row_offsets.data(), dev_output_data.data(), column_starts.back(), dev_validity_block_infos.data(), validity_block_infos.size(), @@ -1610,9 +1555,9 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); dim3 threads(std::min(std::min(256, shmem_limit_per_block / 8), (int)child.size())); - detail::copy_to_columns<<>>( + detail::copy_from_rows<<>>( num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), block_infos.size(), child.data()); @@ -1645,11 +1590,11 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in } auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); detail:: - copy_validity_to_columns<<>>( + copy_validity_from_rows<<>>( num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), dev_output_nm.data(), column_starts.back(), dev_validity_block_infos.data(), validity_block_infos.size(), child.data()); @@ -1707,7 +1652,7 @@ std::unique_ptr convert_from_rows_fixed_width_optimized( int shared_size = 
detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - detail::copy_to_fixed_width_columns<<>>( + detail::copy_from_rows_fixed_width_optimized<<>>( num_rows, num_columns, size_per_row, dev_column_start.data(), dev_column_size.data(), dev_output_data.data(), dev_output_nm.data(), child.data()); From c4b02424dcb4794381a27e8440ea9702d0054ed4 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Thu, 21 Oct 2021 14:49:26 -0700 Subject: [PATCH 27/80] fixed typo --- java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java b/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java index 9541d05ce00..e4106574a19 100644 --- a/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java +++ b/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java @@ -393,7 +393,7 @@ public final void setInts(long offset, int[] data, long srcOffset, long len) { */ public final long getLong(long offset) { long requestedAddress = this.address + offset; - addressOutOfBoundsCheck(requestedAddress, 8, "setLong"); + addressOutOfBoundsCheck(requestedAddress, 8, "getLong"); return UnsafeMemoryAccessor.getLong(requestedAddress); } @@ -404,7 +404,7 @@ public final long getLong(long offset) { */ public final void setLong(long offset, long value) { long requestedAddress = this.address + offset; - addressOutOfBoundsCheck(requestedAddress, 8, "getLong"); + addressOutOfBoundsCheck(requestedAddress, 8, "setLong"); UnsafeMemoryAccessor.setLong(requestedAddress, value); } From e92989c822aa613cf7ed6310ff59b2a8bcf0e376 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Sat, 23 Oct 2021 01:21:15 +0000 Subject: [PATCH 28/80] Updating for actual PR. Fixed a few last minute bugs, removed cudf-land code that was there for testing and benchmarking. 
--- cpp/CMakeLists.txt | 1 - cpp/benchmarks/CMakeLists.txt | 4 - .../row_conversion/row_conversion.cpp | 181 -- cpp/src/row_conversion/row_conversion.cu | 1666 ----------------- cpp/tests/CMakeLists.txt | 4 - cpp/tests/row_conversion/row_conversion.cpp | 677 ------- java/src/main/native/src/row_conversion.cu | 33 +- 7 files changed, 16 insertions(+), 2550 deletions(-) delete mode 100644 cpp/benchmarks/row_conversion/row_conversion.cpp delete mode 100644 cpp/src/row_conversion/row_conversion.cu delete mode 100644 cpp/tests/row_conversion/row_conversion.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 785ac1f72de..82bc5bfba93 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -350,7 +350,6 @@ add_library(cudf src/rolling/rolling.cu src/rolling/rolling_collect_list.cu src/round/round.cu - src/row_conversion/row_conversion.cu src/scalar/scalar.cpp src/scalar/scalar_factories.cpp src/search/search.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 7d353c37df7..b3b92003573 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -250,7 +250,3 @@ ConfigureBench(JSON_BENCH # - io benchmark --------------------------------------------------------------------- ConfigureBench(MULTIBYTE_SPLIT_BENCHMARK io/text/multibyte_split_benchmark.cpp) - -################################################################################################### -# - row conversion benchmark --------------------------------------------------------- -ConfigureBench(ROW_CONVERSION_BENCH row_conversion/row_conversion.cpp) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp deleted file mode 100644 index fb8e4c8aef3..00000000000 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include - -#include -#include -#include - -class RowConversion : public cudf::benchmark { -}; - -static void BM_old_to_row(benchmark::State& state) -{ - cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, - cudf::type_id::INT32, - cudf::type_id::INT16, - cudf::type_id::INT64, - cudf::type_id::INT32, - cudf::type_id::BOOL8, - cudf::type_id::UINT16, - cudf::type_id::UINT8, - cudf::type_id::UINT64}, - 212, - row_count{n_rows}); - - cudf::size_type total_bytes = 0; - for (int i = 0; i < table->num_columns(); ++i) { - auto t = table->get_column(i).type(); - total_bytes += cudf::size_of(t); - } - - for (auto _ : state) { - cuda_event_timer raii(state, true, rmm::cuda_stream_default); - - auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view()); - } - - state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -static void BM_new_to_row(benchmark::State& state) -{ - cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, - cudf::type_id::INT32, - cudf::type_id::INT16, - cudf::type_id::INT64, - cudf::type_id::INT32, - cudf::type_id::BOOL8, - cudf::type_id::UINT16, - cudf::type_id::UINT8, - cudf::type_id::UINT64}, - 212, - row_count{n_rows}); - - cudf::size_type total_bytes = 0; - for (int i = 0; i < table->num_columns(); ++i) { - auto t = table->get_column(i).type(); - total_bytes += cudf::size_of(t); - } - - for (auto _ : state) { - cuda_event_timer raii(state, true, rmm::cuda_stream_default); - - auto new_rows = cudf::convert_to_rows(table->view()); - } - - state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -static void BM_old_from_row(benchmark::State& state) -{ - cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, - cudf::type_id::INT32, - cudf::type_id::INT16, - cudf::type_id::INT64, - cudf::type_id::INT32, - cudf::type_id::BOOL8, - cudf::type_id::UINT16, - cudf::type_id::UINT8, - cudf::type_id::UINT64}, - 256, - row_count{n_rows}); - - std::vector schema; - cudf::size_type total_bytes = 0; - for (int i = 0; i < table->num_columns(); ++i) { - auto t = table->get_column(i).type(); - schema.push_back(t); - total_bytes += cudf::size_of(t); - } - - auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view()); - cudf::lists_column_view const first_list(rows.front()->view()); - - for (auto _ : state) { - cuda_event_timer raii(state, true, rmm::cuda_stream_default); - - auto out = cudf::convert_from_rows_fixed_width_optimized(first_list, schema); - } - - state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -static void BM_new_from_row(benchmark::State& state) -{ - cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, - cudf::type_id::INT32, - cudf::type_id::INT16, - cudf::type_id::INT64, - cudf::type_id::INT32, - cudf::type_id::BOOL8, - cudf::type_id::UINT16, - cudf::type_id::UINT8, - cudf::type_id::UINT64}, - 256, - row_count{n_rows}); - - std::vector schema; - cudf::size_type total_bytes = 0; - for (int i = 0; i < table->num_columns(); ++i) { - auto t = table->get_column(i).type(); - schema.push_back(t); - total_bytes += cudf::size_of(t); - } - - auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view()); - cudf::lists_column_view const 
first_list(rows.front()->view()); - - for (auto _ : state) { - cuda_event_timer raii(state, true, rmm::cuda_stream_default); - - auto out = cudf::convert_from_rows(first_list, schema); - } - - state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) -TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) - -#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -FROM_ROW_CONVERSION_BENCHMARK_DEFINE(old_from_row_conversion, BM_old_from_row) -FROM_ROW_CONVERSION_BENCHMARK_DEFINE(new_from_row_conversion, BM_new_from_row) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu deleted file mode 100644 index c068a2c0b76..00000000000 --- a/cpp/src/row_conversion/row_conversion.cu +++ /dev/null @@ -1,1666 +0,0 @@ -/* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - #include - #include - #include - #include - #include - - #include - #include - - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - #include - #endif - - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 8; - constexpr auto NUM_BLOCKS_PER_KERNEL_TO_ROWS = 2; - constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; - constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; - constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; - #endif - - using cudf::detail::make_device_uvector_async; - using rmm::device_uvector; - namespace cudf { - - namespace detail { - - static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size_t alignment) { - return (offset + alignment - 1) & ~(alignment - 1); - } - - __global__ void copy_from_rows_fixed_width_optimized( - const cudf::size_type num_rows, const cudf::size_type num_columns, - const cudf::size_type row_size, const cudf::size_type *input_offset_in_row, - const cudf::size_type *num_bytes, int8_t **output_data, cudf::bitmask_type **output_nm, - const int8_t *input_data) { - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. 
- // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // For simplicity we will refer to this as a row_group - - // In practice we have found writing more than 4 columns of data per thread - // results in performance loss. As such we are using a 2 dimensional - // kernel in terms of threads, but not in terms of blocks. Columns are - // controlled by the y dimension (there is no y dimension in blocks). Rows - // are controlled by the x dimension (there are multiple blocks in the x - // dimension). - - cudf::size_type rows_per_group = blockDim.x; - cudf::size_type row_group_start = blockIdx.x; - cudf::size_type row_group_stride = gridDim.x; - cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; - - extern __shared__ int8_t shared_data[]; - - // Because we are copying fixed width only data and we stride the rows - // this thread will always start copying from shared data in the same place - int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t *row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; - - for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; - row_group_index += row_group_stride) { - // Step 1: Copy the data into shared memory - // We know row_size is always aligned with and a multiple of int64_t; - int64_t *long_shared = reinterpret_cast(shared_data); - const int64_t *long_input = reinterpret_cast(input_data); - - cudf::size_type shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); - cudf::size_type shared_output_stride = blockDim.x * blockDim.y; - cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); - if (row_index_end > num_rows) { - row_index_end = num_rows; - } - cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - cudf::size_type shared_length = row_size * num_rows_in_group; - - cudf::size_type shared_output_end = shared_length / sizeof(int64_t); - - cudf::size_type start_input_index = - (row_size * row_group_index * rows_per_group) / sizeof(int64_t); - - for (cudf::size_type shared_index = shared_output_index; shared_index < shared_output_end; - shared_index += shared_output_stride) { - long_shared[shared_index] = long_input[start_input_index + shared_index]; - } - // Wait for all of the data to be in shared memory - __syncthreads(); - - // Step 2 copy the data back out - - // Within the row group there should be 1 thread for each row. This is a - // requirement for launching the kernel - cudf::size_type row_index = (row_group_index * rows_per_group) + threadIdx.x; - // But we might not use all of the threads if the number of rows does not go - // evenly into the thread count. We don't want those threads to exit yet - // because we may need them to copy data in for the next row group. 
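      // The membership ballot below is taken with the full 0xffffffff mask *before*
      // the bounds check, so every lane in the warp stays converged and agrees on
      // which lanes actually hold a row. The later validity ballot then uses this
      // active_mask, so tail lanes past num_rows contribute no bits, and the lane
      // whose row_index is a multiple of 32 can write the packed word straight into
      // the null mask.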
- uint32_t active_mask = __ballot_sync(0xffffffff, row_index < num_rows); - if (row_index < num_rows) { - cudf::size_type col_index_start = threadIdx.y; - cudf::size_type col_index_stride = blockDim.y; - for (cudf::size_type col_index = col_index_start; col_index < num_columns; - col_index += col_index_stride) { - cudf::size_type col_size = num_bytes[col_index]; - const int8_t *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); - int8_t *col_output = output_data[col_index]; - switch (col_size) { - case 1: { - col_output[row_index] = *col_tmp; - break; - } - case 2: { - int16_t *short_col_output = reinterpret_cast(col_output); - short_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - case 4: { - int32_t *int_col_output = reinterpret_cast(col_output); - int_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - case 8: { - int64_t *long_col_output = reinterpret_cast(col_output); - long_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - default: { - cudf::size_type output_offset = col_size * row_index; - // TODO this should just not be supported for fixed width columns, but just in case... - for (cudf::size_type b = 0; b < col_size; b++) { - col_output[b + output_offset] = col_tmp[b]; - } - break; - } - } - - cudf::bitmask_type *nm = output_nm[col_index]; - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; - int predicate = *valid_byte & (1 << byte_bit_offset); - uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { - nm[word_index(row_index)] = bitmask; - } - } // end column loop - } // end row copy - // wait for the row_group to be totally copied before starting on the next row group - __syncthreads(); - } - } - - __global__ void copy_to_rows_fixed_width_optimized( - const cudf::size_type start_row, const cudf::size_type num_rows, - const cudf::size_type num_columns, const cudf::size_type row_size, - const cudf::size_type *output_offset_in_row, const cudf::size_type *num_bytes, - const int8_t **input_data, const cudf::bitmask_type **input_nm, int8_t *output_data) { - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // We do not support copying a subset of the columns in a row yet, so we don't - // currently support a row that is wider than shared memory. - // For simplicity we will refer to this as a row_group - - // In practice we have found reading more than 4 columns of data per thread - // results in performance loss. As such we are using a 2 dimensional - // kernel in terms of threads, but not in terms of blocks. Columns are - // controlled by the y dimension (there is no y dimension in blocks). Rows - // are controlled by the x dimension (there are multiple blocks in the x - // dimension). 
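   // Concretely: threadIdx.x picks the row within the current row_group,
   // threadIdx.y strides across that row's columns, and blockIdx.x strides across
   // row_groups. Pass 1 has each (x, y) thread scatter its columns (and validity
   // bits) into the row-shaped staging area in shared memory; pass 2 streams the
   // whole row_group back out to the output buffer as aligned int64_t words.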
- - cudf::size_type rows_per_group = blockDim.x; - cudf::size_type row_group_start = blockIdx.x; - cudf::size_type row_group_stride = gridDim.x; - cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; - - extern __shared__ int8_t shared_data[]; - - // Because we are copying fixed width only data and we stride the rows - // this thread will always start copying to shared data in the same place - int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t *row_vld_tmp = - &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; - - for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; - row_group_index += row_group_stride) { - // Within the row group there should be 1 thread for each row. This is a - // requirement for launching the kernel - cudf::size_type row_index = start_row + (row_group_index * rows_per_group) + threadIdx.x; - // But we might not use all of the threads if the number of rows does not go - // evenly into the thread count. We don't want those threads to exit yet - // because we may need them to copy data back out. - if (row_index < (start_row + num_rows)) { - cudf::size_type col_index_start = threadIdx.y; - cudf::size_type col_index_stride = blockDim.y; - for (cudf::size_type col_index = col_index_start; col_index < num_columns; - col_index += col_index_stride) { - cudf::size_type col_size = num_bytes[col_index]; - int8_t *col_tmp = &(row_tmp[output_offset_in_row[col_index]]); - const int8_t *col_input = input_data[col_index]; - switch (col_size) { - case 1: { - *col_tmp = col_input[row_index]; - break; - } - case 2: { - const int16_t *short_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = short_col_input[row_index]; - break; - } - case 4: { - const int32_t *int_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = int_col_input[row_index]; - break; - } - case 8: { - const int64_t *long_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = long_col_input[row_index]; - break; - } - default: { - cudf::size_type input_offset = col_size * row_index; - // TODO this should just not be supported for fixed width columns, but just in case... 
- for (cudf::size_type b = 0; b < col_size; b++) { - col_tmp[b] = col_input[b + input_offset]; - } - break; - } - } - // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned - // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; - uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; - int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); - cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); - // Now copy validity for the column - if (input_nm[col_index]) { - if (bit_is_set(input_nm[col_index], row_index)) { - atomicOr_block(valid_int, 1 << int_bit_offset); - } else { - atomicAnd_block(valid_int, ~(1 << int_bit_offset)); - } - } else { - // It is valid so just set the bit - atomicOr_block(valid_int, 1 << int_bit_offset); - } - } // end column loop - } // end row copy - // wait for the row_group to be totally copied into shared memory - __syncthreads(); - - // Step 2: Copy the data back out - // We know row_size is always aligned with and a multiple of int64_t; - int64_t *long_shared = reinterpret_cast(shared_data); - int64_t *long_output = reinterpret_cast(output_data); - - cudf::size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); - cudf::size_type shared_input_stride = blockDim.x * blockDim.y; - cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); - if (row_index_end > num_rows) { - row_index_end = num_rows; - } - cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - cudf::size_type shared_length = row_size * num_rows_in_group; - - cudf::size_type shared_input_end = shared_length / sizeof(int64_t); - - cudf::size_type start_output_index = - (row_size * row_group_index * rows_per_group) / sizeof(int64_t); - - for (cudf::size_type shared_index = shared_input_index; shared_index < shared_input_end; - shared_index += shared_input_stride) { - long_output[start_output_index + shared_index] = long_shared[shared_index]; - } - __syncthreads(); - // Go for the next round - } - } - - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - - struct block_info { - int start_col; - int start_row; - int end_col; - int end_row; - int buffer_num; - - __host__ __device__ size_type get_shared_row_size(size_type const *const col_offsets, - size_type const *const col_sizes) const { - return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); - } - __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } - - __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } - }; - - // When building the columns to return, we have to be mindful of the offset limit in cudf. - // It is 32-bit and these data columns are capable of surpassing that easily. The data should - // not be cut off exactly at the limit though due to the validity buffers. The most efficient - // place to cut the validity is on a 32-row boundary, so as we calculate the row sizes - // we keep track of the cut points for the validity, which we call row batches. If the row - // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we - // hit. Note that this boundary is for our book-keeping with column pointers and not anything that - // the kernel needs to worry about. We cut the output at convienient boundaries when assembling - // the outgoing data stream. 
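   // A minimal host-side sketch of that batching rule (simplified; the names below
   // are illustrative only and not part of this file's API): rows are committed to
   // the current batch in 32-row windows, and the batch is closed at the last
   // committed window whenever adding another row would overflow a 32-bit offset.
   // Assumes <vector>, <limits> and <cstdint> are available.
   struct sketch_row_batch {
     uint64_t num_bytes;
     int row_count;
   };

   inline std::vector<sketch_row_batch>
   sketch_build_row_batches(std::vector<uint64_t> const &row_sizes) {
     std::vector<sketch_row_batch> batches;
     uint64_t batch_bytes = 0, window_bytes = 0;  // committed vs. pending bytes
     int batch_rows = 0, window_rows = 0;
     auto const limit = static_cast<uint64_t>(std::numeric_limits<int32_t>::max());
     for (auto const row_size : row_sizes) {
       if (batch_rows > 0 && batch_bytes + window_bytes + row_size > limit) {
         // cut at the last 32-row boundary; the pending window rolls into the next batch
         batches.push_back({batch_bytes, batch_rows});
         batch_bytes = 0;
         batch_rows = 0;
       }
       window_bytes += row_size;
       if (++window_rows == 32) {  // commit the pending window on every 32-row boundary
         batch_bytes += window_bytes;
         batch_rows += window_rows;
         window_bytes = 0;
         window_rows = 0;
       }
     }
     if (batch_rows + window_rows > 0) {
       batches.push_back({batch_bytes + window_bytes, batch_rows + window_rows});
     }
     return batches;
   }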
- struct row_batch { - size_type num_bytes; - size_type row_count; - }; - - /** - * @brief copy data from cudf columns into x format, which is row-based - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param input_data pointer to raw table data - * @param input_nm pointer to validity data - * @param col_sizes array of sizes for each element in a column - one per column - * @param col_offsets offset into input data row for each column's start - * @param block_infos information about the blocks of work - * @param row_offsets offset to a specific row in the input data - * @param output_data pointer to output data - * - */ - __global__ void copy_to_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_block, const size_type num_block_infos, - const int8_t **input_data, const size_type *col_sizes, - const size_type *col_offsets, const block_info *block_infos, - const size_type *row_offsets, int8_t **output_data) { - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // This has been broken up for us in the block_info struct, so we don't have - // any calculation to do here, but it is important to note. - - constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; - auto group = cooperative_groups::this_thread_block(); - extern __shared__ int8_t shared_data[]; - int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; - - __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&block_barrier[i], group.size()); - } - } - - group.sync(); - - auto const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS, - (uint)NUM_BLOCKS_PER_KERNEL_TO_ROWS); - - size_t fetch; - size_t subset; - for (subset = fetch = 0; subset < blocks_remaining; ++subset) { - // Fetch ahead up to stages_count subsets - for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { - auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + fetch]; - auto const num_fetch_cols = fetch_block.num_cols(); - auto const num_fetch_rows = fetch_block.num_rows(); - auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; - auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); - auto const starting_column_offset = col_offsets[fetch_block.start_col]; - auto &fetch_barrier = block_barrier[fetch % NUM_BLOCKS_PER_KERNEL_LOADED]; - - // wait for the last use of the memory to be completed - if (fetch > NUM_BLOCKS_PER_KERNEL_LOADED) { - fetch_barrier.arrive_and_wait(); - } - - // to do the copy we need to do n column copies followed by m element copies OR - // we have to do m element copies followed by r row copies. When going from column - // to row it is much easier to copy by elements first otherwise we would need a running - // total of the column sizes for our block, which isn't readily available. This makes it - // more appealing to copy element-wise from input data into shared matching the end layout - // and do row-based memcopies out. 
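   // Put differently: the staging layout in shared memory already matches the
   // final row layout. Element (relative_row, relative_col) lands at
   //   relative_row * fetch_block_row_size + (col_offset - starting_column_offset),
   // so the write-out phase below can issue one aligned, row-sized memcpy_async
   // per row instead of one copy per element.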
- - auto const shared_buffer_base = shared[fetch % stages_count]; - for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { - auto const relative_col = el / num_fetch_rows; - auto const relative_row = el % num_fetch_rows; - auto const absolute_col = relative_col + fetch_block.start_col; - auto const absolute_row = relative_row + fetch_block.start_row; - auto const col_size = col_sizes[absolute_col]; - auto const col_offset = col_offsets[absolute_col]; - auto const relative_col_offset = col_offset - starting_column_offset; - - auto const shared_offset = relative_row * fetch_block_row_size + relative_col_offset; - auto const input_src = input_data[absolute_col] + col_size * absolute_row; - - // copy the element from global memory - switch (col_size) { - case 2: - cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, - cuda::aligned_size_t<2>(col_size), fetch_barrier); - break; - case 4: - cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, - cuda::aligned_size_t<4>(col_size), fetch_barrier); - break; - case 8: - cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, - cuda::aligned_size_t<8>(col_size), fetch_barrier); - break; - default: - cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, col_size, - fetch_barrier); - break; - } - } - } - - auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; - subset_barrier.arrive_and_wait(); - - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + subset]; - auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); - auto const column_offset = col_offsets[block.start_col]; - auto const block_output_buffer = output_data[block.buffer_num]; - - // copy entire rows to final dest - for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; - absolute_row += blockDim.x) { - auto const relative_row = absolute_row - block.start_row; - auto const output_dest = block_output_buffer + row_offsets[absolute_row] + column_offset; - auto const shared_offset = block_row_size * relative_row; - - cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], - cuda::aligned_size_t<8>(block_row_size), subset_barrier); - } - } - - // wait on the last copies to complete - for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { - block_barrier[i].arrive_and_wait(); - } - } - - /** - * @brief copy data from row-based format to cudf columns - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets - * @param output_data pointer to output data, partitioned by data size - * @param validity_offsets offset into input data row for validity data - * @param block_infos information about the blocks of work - * @param num_block_infos number of infos in blocks array - * @param input_data pointer to input data - * - */ - __global__ void copy_validity_to_rows( - const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, - const size_type *row_offsets, int8_t **output_data, const size_type validity_offset, - const block_info *block_infos, const size_type num_block_infos, const bitmask_type **input_nm) { - extern __shared__ int8_t shared_data[]; - int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { - shared_data, shared_data + shmem_used_per_block / 2}; - - // per conversation with DaveB - // each 
thread of warp reads a single int32 of validity - so we read 128 bytes - // then ballot_sync the bits and write the result to shmem - // after we fill shared mem memcpy it out in a blob. - // probably need knobs for number of rows vs columns to balance read/write - auto group = cooperative_groups::this_thread_block(); - - int const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, - (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); - - __shared__ cuda::barrier - shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&shared_block_barriers[i], group.size()); - } - } - - group.sync(); - - for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { - if (validity_block != validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] - .arrive_and_wait(); - } - int8_t *this_shared_block = shared_blocks[validity_block % 2]; - auto block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; - - auto const num_block_cols = block.num_cols(); - auto const num_block_rows = block.num_rows(); - - auto const num_sections_x = util::div_rounding_up_unsafe(num_block_cols, 32); - auto const num_sections_y = util::div_rounding_up_unsafe(num_block_rows, 32); - auto const validity_data_row_length = - align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); - auto const total_sections = num_sections_x * num_sections_y; - - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); - - // the block is divided into sections. A warp operates on a section at a time. - for (int my_section_idx = warp_id; my_section_idx < total_sections; - my_section_idx += warps_per_block) { - // convert to rows and cols - auto const section_x = my_section_idx % num_sections_x; - auto const section_y = my_section_idx / num_sections_x; - auto const relative_col = section_x * 32 + lane_id; - auto const relative_row = section_y * 32; - auto const absolute_col = relative_col + block.start_col; - auto const absolute_row = relative_row + block.start_row; - auto const cols_left = num_columns - absolute_col; - auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); - - if (absolute_col < num_columns) { - auto my_data = input_nm[absolute_col] != nullptr ? - input_nm[absolute_col][absolute_row / 32] : - std::numeric_limits::max(); - - // every thread that is participating in the warp has a byte, but it's column-based - // data and we need it in row-based. So we shuffle the bits around with ballot_sync to - // make the bytes we actually write. 
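           // The loop below is effectively a 32x32 bit transpose done with warp
           // ballots: lane l holds the 32-row validity word of column
           // (block.start_col + section_x * 32 + l), and iteration i gathers bit i
           // (row relative_row + i) from every participating lane into one word
           // whose bit l is that column's validity. The lead lane then stores the
           // word row-major into shared memory, trimmed to the columns remaining.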
- bitmask_type dw_mask = 1; - for (int i = 0; i < 32 && relative_row + i < num_rows; ++i, dw_mask <<= 1) { - auto validity_data = __ballot_sync(participation_mask, my_data & dw_mask); - // lead thread in each warp writes data - auto const validity_write_offset = - validity_data_row_length * (relative_row + i) + relative_col / 8; - if (threadIdx.x % detail::warp_size == 0) { - if (cols_left <= 8) { - // write byte - this_shared_block[validity_write_offset] = validity_data & 0xFF; - } else if (cols_left <= 16) { - // write int16 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - } else if (cols_left <= 24) { - // write int16 and then int8 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; - } else { - // write int32 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data; - } - } - } - } - } - - // make sure entire block has finished copy - group.sync(); - - auto const output_data_base = - output_data[block.buffer_num] + validity_offset + block.start_col / 8; - - // now async memcpy the shared memory out to the final destination - for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { - auto const relative_row = row - block.start_row; - auto const output_ptr = output_data_base + row_offsets[row]; - auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); - - cuda::memcpy_async( - output_ptr, &this_shared_block[validity_data_row_length * relative_row], num_bytes, - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); - } - } - - // wait for last blocks of data to arrive - for (int validity_block = 0; - validity_block < blocks_remaining % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - ++validity_block) { - shared_block_barriers[validity_block].arrive_and_wait(); - } - } - - static __device__ std::tuple - get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num_cols) { - auto const col_size_bytes = num_cols * col_size_size; - auto const col_offset_bytes = num_cols * col_offset_size; - - return {col_size_bytes, col_offset_bytes}; - } - - /** - * @brief copy data from row-based format to cudf columns - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block - * @param row_offsets - * @param output_data - * @param output_nm - * @param col_sizes array of sizes for each element in a column - one per column - * @param col_offsets offset into input data row for each column's start - * @param block_infos information about the blocks of work - * @param input_data pointer to input data - * - */ - __global__ void copy_from_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_block, const size_type *row_offsets, - int8_t **output_data, const size_type *_col_sizes, - const size_type *_col_offsets, const block_info *block_infos, - const size_type num_block_infos, const int8_t *input_data) { - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. 
- // This has been broken up for us in the block_info struct, so we don't have - // any calculation to do here, but it is important to note. - - // to speed up some of the random access memory we do, we copy col_sizes and col_offsets - // to shared memory for each of the blocks that we work on - - constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; - auto group = cooperative_groups::this_thread_block(); - extern __shared__ int8_t shared_data[]; - int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; - - __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&block_barrier[i], group.size()); - } - } - - group.sync(); - - auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS, - (uint)NUM_BLOCKS_PER_KERNEL_FROM_ROWS); - - size_t fetch_index; - size_t processing_index; - for (processing_index = fetch_index = 0; processing_index < blocks_remaining; - ++processing_index) { - // Fetch ahead up to stages_count groups - for (; fetch_index < static_cast(blocks_remaining) && - fetch_index < (processing_index + stages_count); - ++fetch_index) { - auto const fetch_block = - block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + fetch_index]; - auto const fetch_block_start_row = fetch_block.start_row; - auto const fetch_block_end_row = fetch_block.end_row; - auto const starting_col_offset = _col_offsets[fetch_block.start_col]; - auto const fetch_block_row_size = fetch_block.get_shared_row_size(_col_offsets, _col_sizes); - auto const num_fetch_cols = fetch_block.num_cols(); - auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( - sizeof(decltype(*_col_sizes)), sizeof(decltype(*_col_offsets)), num_fetch_cols); - auto &fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; - - // if we have fetched all buffers, we need to wait for processing - // to complete on them before we can use them again - if (fetch_index > NUM_BLOCKS_PER_KERNEL_LOADED) { - fetch_barrier.arrive_and_wait(); - } - - auto shared_row_offset = 0; - // copy the data for column sizes - cuda::memcpy_async(group, &shared[fetch_index % stages_count][shared_row_offset], - &_col_sizes[fetch_block.start_col], col_size_bytes, fetch_barrier); - shared_row_offset += col_size_bytes; - // copy the data for column offsets - cuda::memcpy_async(group, &shared[fetch_index % stages_count][shared_row_offset], - &_col_offsets[fetch_block.start_col], col_offset_bytes, fetch_barrier); - shared_row_offset += col_offset_bytes; - shared_row_offset = align_offset(shared_row_offset, 8); - - for (auto row = fetch_block_start_row + static_cast(threadIdx.x); - row <= fetch_block_end_row; row += blockDim.x) { - auto shared_offset = - (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; - // copy the main - cuda::memcpy_async(&shared[fetch_index % stages_count][shared_offset], - &input_data[row_offsets[row] + starting_col_offset], - fetch_block_row_size, fetch_barrier); - } - } - - auto &processing_barrier = block_barrier[processing_index % NUM_BLOCKS_PER_KERNEL_LOADED]; - - // ensure our data is ready - processing_barrier.arrive_and_wait(); - - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + processing_index]; - auto const rows_in_block = block.num_rows(); - auto const cols_in_block = block.num_cols(); - - auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( - 
sizeof(decltype(*_col_sizes)), sizeof(decltype(*_col_offsets)), cols_in_block); - auto shared_col_sizes = reinterpret_cast(shared[processing_index % stages_count]); - auto shared_col_offsets = - reinterpret_cast(&shared[processing_index % stages_count][col_size_bytes]); - - auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); - - auto block_row_size = block.get_shared_row_size(_col_offsets, _col_sizes); - - // now we copy from shared memory to final destination. - // the data is laid out in rows in shared memory, so the reads - // for a column will be "vertical". Because of this and the different - // sizes for each column, this portion is handled on row/column basis. - // to prevent each thread working on a single row and also to ensure - // that all threads can do work in the case of more threads than rows, - // we do a global index instead of a double for loop with col/row. - for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { - auto const relative_col = index % cols_in_block; - auto const relative_row = index / cols_in_block; - auto const absolute_col = relative_col + block.start_col; - auto const absolute_row = relative_row + block.start_row; - - auto const shared_memory_row_offset = block_row_size * relative_row; - auto const shared_memory_offset = shared_col_offsets[relative_col] - shared_col_offsets[0] + - shared_memory_row_offset + shared_row_offset; - auto const column_size = shared_col_sizes[relative_col]; - - int8_t *shmem_src = &shared[processing_index % stages_count][shared_memory_offset]; - int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; - - cuda::memcpy_async(dst, shmem_src, column_size, processing_barrier); - } - group.sync(); - } - - // wait on the last copies to complete - for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { - block_barrier[i].arrive_and_wait(); - } - } - - /** - * @brief copy data from row-based format to cudf columns - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets - * @param output_nm - * @param validity_offsets offset into input data row for validity data - * @param block_infos information about the blocks of work - * @param num_block_infos number of infos in blocks array - * @param input_data pointer to input data - * - */ - __global__ void copy_validity_from_rows( - const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, - const size_type *row_offsets, cudf::bitmask_type **output_nm, const size_type validity_offset, - const block_info *block_infos, const size_type num_block_infos, const int8_t *input_data) { - extern __shared__ int8_t shared_data[]; - int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { - shared_data, shared_data + shmem_used_per_block / 2}; - - // per conversation with DaveB - // each thread of warp reads a single byte of validity - so we read 32 bytes - // then ballot_sync the bits and write the result to shmem - // after we fill shared mem memcpy it out in a blob. 
- // probably need knobs for number of rows vs columns to balance read/write - auto group = cooperative_groups::this_thread_block(); - - int const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, - (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); - - __shared__ cuda::barrier - shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&shared_block_barriers[i], group.size()); - } - } - - group.sync(); - - for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { - auto const validity_index = validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - if (validity_block != validity_index) { - shared_block_barriers[validity_index].arrive_and_wait(); - } - int8_t *this_shared_block = shared_blocks[validity_block % 2]; - auto const block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; - auto const block_start_col = block.start_col; - auto const block_start_row = block.start_row; - auto const num_block_cols = block.num_cols(); - auto const num_block_rows = block.num_rows(); - auto const num_sections_x = (num_block_cols + 7) / 8; - auto const num_sections_y = (num_block_rows + 31) / 32; - auto const validity_data_col_length = num_sections_y * 4; // words to bytes - auto const total_sections = num_sections_x * num_sections_y; - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); - - // the block is divided into sections. A warp operates on a section at a time. - for (int my_section_idx = warp_id; my_section_idx < total_sections; - my_section_idx += warps_per_block) { - // convert to rows and cols - auto const section_x = my_section_idx % num_sections_x; - auto const section_y = my_section_idx / num_sections_x; - auto const relative_col = section_x * 8; - auto const relative_row = section_y * 32 + lane_id; - auto const absolute_col = relative_col + block_start_col; - auto const absolute_row = relative_row + block_start_row; - auto const rows_left = num_rows - absolute_row; - - auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_row < num_rows); - - if (absolute_row < num_rows) { - auto const my_byte = - input_data[row_offsets[absolute_row] + validity_offset + absolute_col / 8]; - - // so every thread that is participating in the warp has a byte, but it's row-based - // data and we need it in column-based. So we shiffle the bits around to make - // the bytes we actually write. 
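           // This is the mirror image of the to-rows kernel: lane l holds one
           // validity byte of row (block.start_row + section_y * 32 + l), and
           // iteration i ballots bit i of that byte across the 32 lanes, producing
           // a 32-row word for column (relative_col + i) that the lead lane writes
           // column-major into shared memory, trimmed to the rows remaining.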
- for (int i = 0, byte_mask = 1; i < 8 && relative_col + i < num_columns; - ++i, byte_mask <<= 1) { - auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); - // lead thread in each warp writes data - if (threadIdx.x % detail::warp_size == 0) { - auto const validity_write_offset = - validity_data_col_length * (relative_col + i) + relative_row / 8; - - if (rows_left <= 8) { - // write byte - this_shared_block[validity_write_offset] = validity_data & 0xFF; - } else if (rows_left <= 16) { - // write int16 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - } else if (rows_left <= 24) { - // write int16 and then int8 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; - } else { - // write int32 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data; - } - } - } - } - } - - // make sure entire block has finished copy - group.sync(); - - // now async memcpy the shared - for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { - auto const relative_col = col - block.start_col; - - cuda::memcpy_async( - output_nm[col] + word_index(block_start_row), - &this_shared_block[validity_data_col_length * relative_col], - util::div_rounding_up_unsafe(num_block_rows, 8), - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); - } - } - - // wait for last blocks of data to arrive - auto const num_blocks_to_wait = blocks_remaining > NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED ? - NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED : - blocks_remaining; - for (int validity_block = 0; validity_block < num_blocks_to_wait; ++validity_block) { - shared_block_barriers[validity_block].arrive_and_wait(); - } - } - - #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - - /** - * Calculate the dimensions of the kernel for fixed width only columns. - * @param [in] num_columns the number of columns being copied. - * @param [in] num_rows the number of rows being copied. - * @param [in] size_per_row the size each row takes up when padded. - * @param [out] blocks the size of the blocks for the kernel - * @param [out] threads the size of the threads for the kernel - * @return the size in bytes of shared memory needed for each block. - */ - static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, - const cudf::size_type num_rows, - const cudf::size_type size_per_row, dim3 &blocks, - dim3 &threads) { - // We have found speed degrades when a thread handles more than 4 columns. - // Each block is 2 dimensional. The y dimension indicates the columns. - // We limit this to 32 threads in the y dimension so we can still - // have at least 32 threads in the x dimension (1 warp) which should - // result in better coalescing of memory operations. We also - // want to guarantee that we are processing a multiple of 32 threads - // in the x dimension because we use atomic operations at the block - // level when writing validity data out to main memory, and that would - // need to change if we split a word of validity data between blocks. 
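   // Net effect of the constraints above, given the 48KB default shared-memory
   // budget: y = min(ceil(num_columns / 4), 32) threads cover the columns, x is
   // min(1024 / y, 48KB / size_per_row) rounded down to a multiple of 32, and each
   // block stages x full rows, i.e. size_per_row * x bytes of shared memory.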
- int y_block_size = (num_columns + 3) / 4; // cudf::util::div_rounding_up_safe(num_columns, 4); - if (y_block_size > 32) { - y_block_size = 32; - } - int x_possible_block_size = 1024 / y_block_size; - // 48KB is the default setting for shared memory per block according to the cuda tutorials - // If someone configures the GPU to only have 16 KB this might not work. - int max_shared_size = 48 * 1024; - int max_block_size = max_shared_size / size_per_row; - // If we don't have enough shared memory there is no point in having more threads - // per block that will just sit idle - max_block_size = max_block_size > x_possible_block_size ? x_possible_block_size : max_block_size; - // Make sure that the x dimension is a multiple of 32 this not only helps - // coalesce memory access it also lets us do a ballot sync for validity to write - // the data back out the warp level. If x is a multiple of 32 then each thread in the y - // dimension is associated with one or more warps, that should correspond to the validity - // words directly. - int block_size = (max_block_size / 32) * 32; - CUDF_EXPECTS(block_size != 0, "Row size is too large to fit in shared memory"); - - int num_blocks = (num_rows + block_size - 1) / block_size; - if (num_blocks < 1) { - num_blocks = 1; - } else if (num_blocks > 10240) { - // The maximum number of blocks supported in the x dimension is 2 ^ 31 - 1 - // but in practice haveing too many can cause some overhead that I don't totally - // understand. Playing around with this haveing as little as 600 blocks appears - // to be able to saturate memory on V100, so this is an order of magnitude higher - // to try and future proof this a bit. - num_blocks = 10240; - } - blocks.x = num_blocks; - blocks.y = 1; - blocks.z = 1; - threads.x = block_size; - threads.y = y_block_size; - threads.z = 1; - return size_per_row * block_size; - } - - /** - * When converting to rows it is possible that the size of the table was too big to fit - * in a single column. This creates an output column for a subset of the rows in a table - * going from start row and containing the next num_rows. Most of the parameters passed - * into this function are common between runs and should be calculated once. 
- */ - static std::unique_ptr - fixed_width_convert_to_rows(const cudf::size_type start_row, const cudf::size_type num_rows, - const cudf::size_type num_columns, const cudf::size_type size_per_row, - rmm::device_uvector &column_start, - rmm::device_uvector &column_size, - rmm::device_uvector &input_data, - rmm::device_uvector &input_nm, - const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, - rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - int64_t total_allocation = size_per_row * num_rows; - // We made a mistake in the split somehow - CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); - - // Allocate and set the offsets row for the byte array - std::unique_ptr offsets = - cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream); - - std::unique_ptr data = cudf::make_numeric_column( - cudf::data_type(cudf::type_id::INT8), static_cast(total_allocation), - cudf::mask_state::UNALLOCATED, stream, mr); - - dim3 blocks; - dim3 threads; - int shared_size = - detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - - copy_to_rows_fixed_width_optimized<<>>( - start_row, num_rows, num_columns, size_per_row, column_start.data(), column_size.data(), - input_data.data(), input_nm.data(), data->mutable_view().data()); - - return cudf::make_lists_column(num_rows, std::move(offsets), std::move(data), 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr); - } - - static cudf::data_type get_data_type(const cudf::column_view &v) { - return v.type(); - } - - static inline bool are_all_fixed_width(std::vector const &schema) { - return std::all_of(schema.begin(), schema.end(), - [](const cudf::data_type &t) { return cudf::is_fixed_width(t); }); - } - - /** - * Given a set of fixed width columns, calculate how the data will be laid out in memory. - * @param [in] schema the types of columns that need to be laid out. - * @param [out] column_start the byte offset where each column starts in the row. - * @param [out] column_size the size in bytes of the data for each columns in the row. - * @return the size in bytes each row needs. - */ - static inline int32_t compute_fixed_width_layout(std::vector const &schema, - std::vector &column_start, - std::vector &column_size) { - // We guarantee that the start of each column is 64-bit aligned so anything can go - // there, but to make the code simple we will still do an alignment for it. - int32_t at_offset = 0; - for (auto col = schema.begin(); col < schema.end(); col++) { - cudf::size_type s = cudf::size_of(*col); - column_size.emplace_back(s); - std::size_t allocation_needed = s; - std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types - at_offset = align_offset(at_offset, alignment_needed); - column_start.emplace_back(at_offset); - at_offset += allocation_needed; - } - - // Now we need to add in space for validity - // Eventually we can think about nullable vs not nullable, but for now we will just always add - // it in - int32_t validity_bytes_needed = - (schema.size() + 7) / 8; // cudf::util::div_rounding_up_safe(schema.size(), 8); - // validity comes at the end and is byte aligned so we can pack more in. 
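// As a concrete illustration with an assumed schema of {INT8, INT64, INT32}:
//   column_start = {0, 8, 16} and column_size = {1, 8, 4} after alignment
//   validity_bytes_needed = (3 + 7) / 8 = 1 byte, landing at offset 20
//   padded row size       = align_offset(21, 8) = 24 bytes per row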
- at_offset += validity_bytes_needed; - // Now we need to pad the end so all rows are 64 bit aligned - return align_offset(at_offset, 8); // 8 bytes (64 bits) - } - - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - - template - static size_type compute_column_information(iterator begin, iterator end, - std::vector &column_starts, - std::vector &column_sizes) //, - // std::function nested_type_cb) - { - size_type fixed_width_size_per_row = 0; - for (auto cv = begin; cv != end; ++cv) { - auto col_type = std::get<0>(*cv); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - // if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } - - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 8 : size_of(col_type); - - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - } - - auto validity_offset = fixed_width_size_per_row; - column_starts.push_back(validity_offset); - - return fixed_width_size_per_row; - } - - std::vector - build_validity_block_infos(size_type const &num_columns, size_type const &num_rows, - size_type const &shmem_limit_per_block, - std::vector const &row_batches) { - auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); - auto const column_stride = align_offset( - [&]() { - if (desired_rows_and_columns > num_columns) { - // not many columns, group it into 8s and ship it off - return std::min(8, num_columns); - } else { - return util::round_down_safe(desired_rows_and_columns, 8); - } - }(), - 8); - // we fit as much as we can given the column stride - // note that an element in the table takes just 1 bit, but a row with a single - // element still takes 8 bytes! 
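// To make the numbers concrete (an assumed 48 KiB shared-memory budget and 300 columns,
// purely for illustration):
//   desired_rows_and_columns = sqrt(49152)              ~= 221
//   column_stride            = round_down(221, 8)        = 216 columns per window
//   bytes_per_row            = align(ceil(216 / 8), 8)    = 32 validity bytes per row
//   row_stride               = min(num_rows, 49152 / 32)  = 1536 rows per window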
- auto const bytes_per_row = align_offset(util::div_rounding_up_unsafe(column_stride, 8), 8); - auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); - - std::vector validity_block_infos; - for (int col = 0; col < num_columns; col += column_stride) { - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int row = 0; - while (row < num_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(row_stride, rows_left_in_batch); - - validity_block_infos.emplace_back(detail::block_info{ - col, row, std::min(col + column_stride - 1, num_columns - 1), row + window_height - 1}); - row += window_height; - rows_left_in_batch -= window_height; - } - } - - return validity_block_infos; - } - - std::vector build_block_infos(std::vector const &column_sizes, - std::vector const &column_starts, - std::vector const &row_batches, - size_type const total_number_of_rows, - size_type const &shmem_limit_per_block) { - std::vector block_infos; - - // block infos are organized with the windows going "down" the columns - // this provides the most coalescing of memory access - int current_window_width = 0; - int current_window_start_col = 0; - - // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( - int const start_col, int const end_col, int const desired_window_height) { - int current_window_start_row = 0; - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; - while (i < total_number_of_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(desired_window_height, rows_left_in_batch); - - block_infos.emplace_back(detail::block_info{ - start_col, current_window_start_row, end_col, - std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), - current_window_row_batch}); - - i += window_height; - current_window_start_row += window_height; - rows_left_in_batch -= window_height; - } - }; - - // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write - // would be memory cache line sized access, but since other blocks will read/write the edges - // this may not turn out to be overly important. For now, we will attempt to build a square - // window as far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = - // shared_mem_size since we want them equal, so height and width are sqrt(shared_mem_size). The - // trick is that it's in bytes, not rows or columns. - size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); - int const window_height = std::clamp( - util::round_up_safe( - std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], - total_number_of_rows), - 32), - 1, row_batches[0].row_count); - - auto calc_admin_data_size = [](int num_cols) -> size_type { - // admin data is the column sizes and column start information. - // this is copied to shared memory as well and needs to be accounted for - // in the window calculation. 
- return num_cols * sizeof(size_type) + num_cols * sizeof(size_type); - }; - - int row_size = 0; - - // march each column and build the blocks of appropriate sizes - for (unsigned int col = 0; col < column_sizes.size(); ++col) { - auto const col_size = column_sizes[col]; - - // align size for this type - std::size_t alignment_needed = col_size; // They are the same for fixed width types - auto row_size_aligned = detail::align_offset(row_size, alignment_needed); - auto row_size_with_this_col = row_size_aligned + col_size; - auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - - if (row_size_with_end_pad * window_height + - calc_admin_data_size(col - current_window_start_col) > - shmem_limit_per_block) { - // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col == 0 ? col : col - 1, window_height); - row_size = - detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); - row_size += col_size; // alignment required for shared memory window boundary to match - // alignment of output row - current_window_start_col = col; - current_window_width = 0; - } else { - row_size = row_size_with_this_col; - current_window_width++; - } - } - - // build last set of blocks - if (current_window_width > 0) { - build_blocks(current_window_start_col, (int)column_sizes.size() - 1, window_height); - } - - return block_infos; - } - - #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - - } // namespace detail - - std::vector> convert_to_rows(cudf::table_view const &tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the - // data, but small enough that multiple columns fit in memory so the writes can coalese as well. - // Potential optimization for window sizes. - const size_type num_columns = tbl.num_columns(); - const size_type num_rows = tbl.num_rows(); - - int device_id; - CUDA_TRY(cudaGetDevice(&device_id)); - int total_shmem; - CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - - // TODO: why? - total_shmem -= 1024; - int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - - // break up the work into blocks, which are a starting and ending row/col #. - // this window size is calculated based on the shared memory size available - // we want a single block to fill up the entire shared memory space available - // for the transpose-like conversion. - - // There are two different processes going on here. The GPU conversion of the data - // and the writing of the data into the list of byte columns that are a maximum of - // 2 gigs each due to offset maximum size. The GPU conversion portion has to understand - // this limitation because the column must own the data inside and as a result it must be - // a distinct allocation for that column. Copying the data into these final buffers would - // be prohibitively expensive, so care is taken to ensure the GPU writes to the proper buffer. - // The windows are broken at the boundaries of specific rows based on the row sizes up - // to that point. These are row batches and they are decided first before building the - // windows so the windows can be properly cut around them. 
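The cutting rule itself is small enough to sketch on its own. The helper below is only an illustration of the batching logic described above, with assumed names and simplified bookkeeping (the real code also tracks per-row offsets and 8-byte alignment while it scans):

#include <cstddef>
#include <cstdint>
#include <vector>

struct batch_sketch {
  uint64_t num_bytes;
  int row_count;
};

// Group rows into batches that stay under byte_limit, cutting each finished batch back
// to a whole multiple of 32 rows so a 32-bit validity word never straddles two batches.
std::vector<batch_sketch> sketch_row_batches(std::vector<uint64_t> const &row_sizes,
                                             uint64_t byte_limit) {
  std::vector<batch_sketch> batches;
  std::vector<uint64_t> open;  // sizes of the rows in the batch being built
  uint64_t open_bytes = 0;
  for (auto size : row_sizes) {
    if (!open.empty() && open_bytes + size > byte_limit) {
      std::size_t keep = open.size() - (open.size() % 32);
      if (keep == 0) { keep = open.size(); }  // degenerate case: individual rows are huge
      uint64_t kept_bytes = 0;
      for (std::size_t i = 0; i < keep; ++i) { kept_bytes += open[i]; }
      batches.push_back({kept_bytes, static_cast<int>(keep)});
      // rows past the 32-row boundary carry over into the next batch
      open.erase(open.begin(), open.begin() + keep);
      open_bytes -= kept_bytes;
    }
    open.push_back(size);
    open_bytes += size;
  }
  if (!open.empty()) { batches.push_back({open_bytes, static_cast<int>(open.size())}); }
  return batches;
}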
- - // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; - input_data.reserve(num_columns); - input_nm.reserve(num_columns); - for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); - auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (!nested_type) { - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - } - - auto dev_input_data = make_device_uvector_async(input_data, stream, mr); - auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); - - std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row - std::vector column_sizes; // byte size of each column - std::vector column_starts; // offset of column inside a row including alignment - std::vector - variable_width_columns; // list of the variable width columns in the table - row_sizes.reserve(num_rows); - row_offsets.reserve(num_rows); - column_sizes.reserve(num_columns); - column_starts.reserve(num_columns + 1); // we add a final offset for validity data start - - auto iter = - thrust::make_transform_iterator(thrust::make_counting_iterator(0), - [&tbl](auto i) -> std::tuple { - return std::make_tuple(tbl.column(i).type(), tbl.column(i)); - }); - - size_type fixed_width_size_per_row = - detail::compute_column_information(iter, iter + num_columns, column_starts, column_sizes); - - auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); - auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); - - std::vector row_batches; - - uint64_t row_batch_size = 0; - uint64_t total_table_size = 0; - size_type row_batch_rows = 0; - uint64_t row_offset = 0; - - // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then - // calculate the size of each row's variable-width data and validity as well. 
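// For example, with assumed values of 48 bytes of aligned fixed-width data and 9 columns:
//   validity_size = 1 bitmask word           = 4 bytes
//   row size      = align_offset(48 + 4, 8)  = 56 bytes, before any variable-width data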
- auto validity_size = num_bitmask_words(num_columns) * 4; - // thrust - for (int row = 0; row < num_rows; ++row) { - auto aligned_row_batch_size = - detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned - row_sizes[row] = fixed_width_size_per_row; - // validity is byte aligned - row_sizes[row] += validity_size; - // variable width data is 8-byte aligned - row_sizes[row] = detail::align_offset(row_sizes[row], 8); // rows are 8 byte aligned - - if ((uint64_t)aligned_row_batch_size + row_sizes[row] > - (uint64_t)std::numeric_limits::max()) { - // a new batch starts at the last 32-row boundary - row_batches.push_back( - detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); - row_batch_size = 0; - row_batch_rows = row_batch_rows & 31; - row_offset = 0; - aligned_row_batch_size = 0; - } - row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned - row_offsets.push_back(row_offset); - row_batch_size = aligned_row_batch_size + row_sizes[row]; - row_offset += row_sizes[row]; - total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned - total_table_size += row_sizes[row]; - row_batch_rows++; - } - if (row_batch_size > 0) { - row_batches.push_back( - detail::row_batch{static_cast(row_batch_size), row_batch_rows}); - } - - auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); - - std::vector output_buffers; - std::vector output_data; - output_data.reserve(row_batches.size()); - for (uint i = 0; i < row_batches.size(); ++i) { - rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); - output_data.push_back(static_cast(temp.data())); - output_buffers.push_back(std::move(temp)); - } - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - - // blast through the entire table and convert it - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); - dim3 threads(256); - - detail::copy_to_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, block_infos.size(), dev_input_data.data(), - dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), dev_row_offsets.data(), - reinterpret_cast(dev_output_data.data())); - - auto validity_block_infos = - build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); - - auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); - dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); - dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); - detail::copy_validity_to_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, dev_row_offsets.data(), dev_output_data.data(), - column_starts.back(), dev_validity_block_infos.data(), validity_block_infos.size(), - dev_input_nm.data()); - - // split up the output buffer into multiple buffers based on row batch sizes - // and create list of byte columns - int offset_offset = 0; - std::vector> ret; - for (uint i = 0; i < row_batches.size(); ++i) { - // compute offsets for this row batch - std::vector offset_vals; - offset_vals.reserve(row_batches[i].row_count + 1); - size_type cur_offset = 0; - offset_vals.push_back(cur_offset); - for (int row = 0; row < row_batches[i].row_count; ++row) { - cur_offset = 
detail::align_offset(cur_offset, 8) + row_sizes[row + offset_offset]; - offset_vals.push_back(cur_offset); - } - offset_offset += row_batches[i].row_count; - - auto dev_offsets = make_device_uvector_async(offset_vals, stream, mr); - auto offsets = std::make_unique(data_type{type_id::INT32}, - (size_type)offset_vals.size(), dev_offsets.release()); - - auto data = std::make_unique(data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, - std::move(output_buffers[i])); - - ret.push_back( - cudf::make_lists_column(row_batches[i].row_count, std::move(offsets), std::move(data), 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr)); - } - - return ret; - #else - CUDF_FAIL("Column to row conversion optimization requires volta or later hardware."); - return {}; - #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - } - - std::vector> - convert_to_rows_fixed_width_optimized(cudf::table_view const &tbl, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { - const cudf::size_type num_columns = tbl.num_columns(); - - std::vector schema; - schema.resize(num_columns); - std::transform(tbl.begin(), tbl.end(), schema.begin(), detail::get_data_type); - - if (detail::are_all_fixed_width(schema)) { - std::vector column_start; - std::vector column_size; - - int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); - auto dev_column_start = make_device_uvector_async(column_start, stream, mr); - auto dev_column_size = make_device_uvector_async(column_size, stream, mr); - - int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; - // Make the number of rows per batch a multiple of 32 so we don't have to worry about - // splitting validity at a specific row offset. This might change in the future. - max_rows_per_batch = (max_rows_per_batch / 32) * 32; - - cudf::size_type num_rows = tbl.num_rows(); - - // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; - for (cudf::size_type column_number = 0; column_number < num_columns; column_number++) { - cudf::column_view cv = tbl.column(column_number); - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - auto dev_input_data = make_device_uvector_async(input_data, stream, mr); - auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); - - using ScalarType = cudf::scalar_type_t; - auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - zero->set_valid_async(true, stream); - static_cast(zero.get())->set_value(0, stream); - - auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - step->set_valid_async(true, stream); - static_cast(step.get()) - ->set_value(static_cast(size_per_row), stream); - - std::vector> ret; - for (cudf::size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { - cudf::size_type row_count = num_rows - row_start; - row_count = row_count > max_rows_per_batch ? 
max_rows_per_batch : row_count; - ret.emplace_back(detail::fixed_width_convert_to_rows( - row_start, row_count, num_columns, size_per_row, dev_column_start, dev_column_size, - dev_input_data, dev_input_nm, *zero, *step, stream, mr)); - } - - return ret; - } else { - CUDF_FAIL("Only fixed width types are currently supported"); - } - } - - std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - // verify that the types are what we expect - cudf::column_view child = input.child(); - cudf::type_id list_type = child.type().id(); - CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, - "Only a list of bytes is supported as input"); - - cudf::size_type num_columns = schema.size(); - cudf::size_type num_rows = input.parent().size(); - - int device_id; - CUDA_TRY(cudaGetDevice(&device_id)); - int total_shmem; - CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - - // TODO why? - total_shmem -= 1024; - int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - - std::vector column_starts; - std::vector column_sizes; - - auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { - return std::make_tuple(schema[i], nullptr); - }); - size_type fixed_width_size_per_row = detail::compute_column_information( - iter, iter + num_columns, column_starts, column_sizes); //, [](void *) {}); - - size_type validity_size = num_bitmask_words(num_columns) * 4; - - size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); - - // Ideally we would check that the offsets are all the same, etc. 
but for now - // this is probably fine - CUDF_EXPECTS(row_size * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); - auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); - - // build the row_batches from the passed in list column - std::vector row_batches; - - row_batches.push_back(detail::row_batch{child.size(), num_rows}); - - // Allocate the columns we are going to write into - std::vector> output_columns; - std::vector output_data; - std::vector output_nm; - for (cudf::size_type i = 0; i < num_columns; i++) { - auto column = cudf::make_fixed_width_column(schema[i], num_rows, - cudf::mask_state::UNINITIALIZED, stream, mr); - auto mut = column->mutable_view(); - output_data.emplace_back(mut.data()); - output_nm.emplace_back(mut.null_mask()); - output_columns.emplace_back(std::move(column)); - } - - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); - - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); - dim3 threads(std::min(std::min(256, shmem_limit_per_block / 8), (int)child.size())); - detail::copy_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), - dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), - block_infos.size(), child.data()); - - auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); - auto const column_stride = [&]() { - if (desired_rows_and_columns > num_columns) { - // not many columns, group it into 64s and ship it off - return std::min(64, num_columns); - } else { - return util::round_down_safe(desired_rows_and_columns, 8); - } - }(); - auto const row_stride = [&]() { - // we fit as much as we can, we know the column stride now, so calculate the row - return std::min(num_rows, util::round_down_safe(shmem_limit_per_block * 8 / column_stride, 32)); - /* if (desired_rows_and_columns > num_rows) { - return std::min(32, num_rows); - } else { - return util::round_down_safe(desired_rows_and_columns, 32); - }*/ - }(); - std::vector validity_block_infos; - for (int col = 0; col < num_columns; col += column_stride) { - for (int row = 0; row < num_rows; row += row_stride) { - validity_block_infos.emplace_back( - detail::block_info{col, row, std::min(col + column_stride - 1, num_columns - 1), - std::min(row + row_stride - 1, num_rows - 1)}); - } - } - auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); - dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); - - dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); - detail:: - copy_validity_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), - dev_output_nm.data(), column_starts.back(), dev_validity_block_infos.data(), - validity_block_infos.size(), child.data()); - - return std::make_unique(std::move(output_columns)); - #else - CUDF_FAIL("Row to column conversion optimization requires volta or later hardware."); - return {}; - #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - } - - std::unique_ptr 
convert_from_rows_fixed_width_optimized( - cudf::lists_column_view const &input, std::vector const &schema, - rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - // verify that the types are what we expect - cudf::column_view child = input.child(); - cudf::type_id list_type = child.type().id(); - CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, - "Only a list of bytes is supported as input"); - - cudf::size_type num_columns = schema.size(); - - if (detail::are_all_fixed_width(schema)) { - std::vector column_start; - std::vector column_size; - - cudf::size_type num_rows = input.parent().size(); - int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); - - // Ideally we would check that the offsets are all the same, etc. but for now - // this is probably fine - CUDF_EXPECTS(size_per_row * num_rows == child.size(), - "The layout of the data appears to be off"); - auto dev_column_start = make_device_uvector_async(column_start, stream); - auto dev_column_size = make_device_uvector_async(column_size, stream); - - // Allocate the columns we are going to write into - std::vector> output_columns; - std::vector output_data; - std::vector output_nm; - for (cudf::size_type i = 0; i < num_columns; i++) { - auto column = cudf::make_fixed_width_column(schema[i], num_rows, - cudf::mask_state::UNINITIALIZED, stream, mr); - auto mut = column->mutable_view(); - output_data.emplace_back(mut.data()); - output_nm.emplace_back(mut.null_mask()); - output_columns.emplace_back(std::move(column)); - } - - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); - - dim3 blocks; - dim3 threads; - int shared_size = - detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - - detail::copy_from_rows_fixed_width_optimized<<>>( - num_rows, num_columns, size_per_row, dev_column_start.data(), dev_column_size.data(), - dev_output_data.data(), dev_output_nm.data(), child.data()); - - return std::make_unique(std::move(output_columns)); - } else { - CUDF_FAIL("Only fixed width types are currently supported"); - } - } - - } // namespace cudf - \ No newline at end of file diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 2da28425c9e..088b0b747fb 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -330,10 +330,6 @@ ConfigureTest(RESHAPE_TEST reshape/interleave_columns_tests.cpp reshape/tile_tests.cpp) -################################################################################################### -# - row conversion test ---------------------------------------------------------------------------------- -ConfigureTest(ROW_CONVERSION_TEST row_conversion/row_conversion.cpp) - ################################################################################################### # - traits test ----------------------------------------------------------------------------------- ConfigureTest(TRAITS_TEST types/traits_test.cpp) diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp deleted file mode 100644 index b807b5cec81..00000000000 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ /dev/null @@ -1,677 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -struct ColumnToRowTests : public cudf::test::BaseFixture { -}; -struct RowToColumnTests : public cudf::test::BaseFixture { -}; - -TEST_F(ColumnToRowTests, Single) -{ - cudf::test::fixed_width_column_wrapper a({-1}); - cudf::table_view in(std::vector{a}); - std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Simple) -{ - cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); - cudf::table_view in(std::vector{a}); - std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Tall) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); - cudf::table_view in(std::vector{a}); - std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Wide) -{ - std::vector> cols; - std::vector views; - std::vector schema; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = 
cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, SingleByteWide) -{ - std::vector> cols; - std::vector views; - std::vector schema; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); - views.push_back(cols.back()); - - schema.push_back(cudf::data_type{cudf::type_id::INT8}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Non2Power) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - constexpr auto num_rows = 6 * 1024 + 557; - for (int i = 0; i < 131; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - for (int j = 0; j < old_tbl->num_columns(); ++j) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); - } - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Big) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - // 28 columns of 1 million rows - constexpr auto num_rows = 1024 * 1024; - for (int i = 0; i < 28; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - for (int j = 0; j < old_tbl->num_columns(); ++j) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); - } - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Bigger) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - // 128 columns of 1 million rows - constexpr auto 
num_rows = 1024 * 1024; - for (int i = 0; i < 128; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - for (int j = 0; j < old_tbl->num_columns(); ++j) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); - } - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Biggest) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - // 128 columns of 2 million rows - constexpr auto num_rows = 2 * 1024 * 1024; - for (int i = 0; i < 128; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - for (int j = 0; j < old_tbl->num_columns(); ++j) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); - } - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Single) -{ - cudf::test::fixed_width_column_wrapper a({-1}); - cudf::table_view in(std::vector{a}); - - auto old_rows = cudf::convert_to_rows(in); - std::vector schema{cudf::data_type{cudf::type_id::INT32}}; - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Simple) -{ - cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); - cudf::table_view in(std::vector{a}); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - std::vector schema{cudf::data_type{cudf::type_id::INT32}}; - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Tall) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); - cudf::table_view in(std::vector{a}); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - 
std::vector schema; - schema.reserve(in.num_columns()); - for (auto col = in.begin(); col < in.end(); ++col) { - schema.push_back(col->type()); - } - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Wide) -{ - std::vector> cols; - std::vector views; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({i})); // rand()})); - views.push_back(cols.back()); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - std::vector schema; - schema.reserve(in.num_columns()); - for (auto col = in.begin(); col < in.end(); ++col) { - schema.push_back(col->type()); - } - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, SingleByteWide) -{ - std::vector> cols; - std::vector views; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); - views.push_back(cols.back()); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - std::vector schema; - schema.reserve(in.num_columns()); - for (auto col = in.begin(); col < in.end(); ++col) { - schema.push_back(col->type()); - } - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, AllTypes) -{ - std::vector> cols; - std::vector views; - std::vector schema{cudf::data_type{cudf::type_id::INT64}, - cudf::data_type{cudf::type_id::FLOAT64}, - cudf::data_type{cudf::type_id::INT8}, - cudf::data_type{cudf::type_id::BOOL8}, - cudf::data_type{cudf::type_id::FLOAT32}, - cudf::data_type{cudf::type_id::INT8}, - cudf::data_type{cudf::type_id::INT32}, - cudf::data_type{cudf::type_id::INT64}}; - - cudf::test::fixed_width_column_wrapper c0({3, 9, 4, 2, 20, 0}, {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper c1({5.0, 9.5, 0.9, 7.23, 2.8, 0.0}, - {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper c2({5, 1, 0, 2, 7, 0}, {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper c3({true, false, false, true, false, false}, - {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper c4({1.0f, 3.5f, 5.9f, 7.1f, 9.8f, 0.0f}, - {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper c5({2, 3, 4, 5, 9, 0}, {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_point_column_wrapper c6( - {-300, 500, 950, 90, 723, 0}, {1, 1, 1, 1, 1, 1, 1, 0}, numeric::scale_type{-2}); - cudf::test::fixed_point_column_wrapper c7( - {-80, 30, 90, 20, 200, 0}, {1, 1, 1, 1, 1, 1, 0}, numeric::scale_type{-1}); - - cudf::table_view in({c0, c1, c2, c3, c4, c5, c6, c7}); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - 
cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, AllTypesLarge) -{ - std::vector cols; - std::vector schema{}; - - // 10 columns of each type with 1024 entries - constexpr int num_rows{1024}; - - std::default_random_engine re; - std::uniform_real_distribution rand_double(std::numeric_limits::min(), - std::numeric_limits::max()); - std::uniform_int_distribution rand_int64(std::numeric_limits::min(), - std::numeric_limits::max()); - auto r = cudf::detail::make_counting_transform_iterator( - 0, [&](auto i) -> int64_t { return rand_int64(re); }); - auto d = cudf::detail::make_counting_transform_iterator( - 0, [&](auto i) -> double { return rand_double(re); }); - - auto all_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 1; }); - auto none_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 0; }); - auto most_valid = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return rand() % 2 == 0 ? 0 : 1; }); - auto few_valid = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return rand() % 13 == 0 ? 1 : 0; }); - - for (int i = 0; i < 10; ++i) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, all_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::INT8}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::INT16}); - } - - for (int i = 0; i < 10; ++i) { - if (i < 5) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) - .release() - .release()); - } else { - cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, none_valid) - .release() - .release()); - } - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(d, d + num_rows, most_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::FLOAT32}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(d, d + num_rows, most_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::FLOAT64}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::BOOL8}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper( - r, r + num_rows, all_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper( - r, r + num_rows, most_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back(*cudf::test::fixed_point_column_wrapper( - r, r + num_rows, all_valid, numeric::scale_type{-2}) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::DECIMAL32}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back(*cudf::test::fixed_point_column_wrapper( - r, r + 
num_rows, most_valid, numeric::scale_type{-1}) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::DECIMAL64}); - } - - std::vector views(cols.begin(), cols.end()); - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Non2Power) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - constexpr auto num_rows = 6 * 1024 + 557; - for (int i = 0; i < 131; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Big) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - // 28 columns of 1 million rows - constexpr auto num_rows = 1024 * 1024; - for (int i = 0; i < 28; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Bigger) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - // 28 columns of 1 million rows - constexpr auto num_rows = 1024 * 1024; - for (int i = 0; i < 128; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Biggest) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> 
int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - // 28 columns of 1 million rows - constexpr auto num_rows = 5 * 1024 * 1024; - for (int i = 0; i < 128; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 932afa4bb70..f5936e86bcd 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -50,7 +50,7 @@ #include #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 -constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 2; constexpr auto NUM_BLOCKS_PER_KERNEL_TO_ROWS = 2; constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; @@ -409,7 +409,7 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum auto &fetch_barrier = block_barrier[fetch % NUM_BLOCKS_PER_KERNEL_LOADED]; // wait for the last use of the memory to be completed - if (fetch > NUM_BLOCKS_PER_KERNEL_LOADED) { + if (fetch >= NUM_BLOCKS_PER_KERNEL_LOADED) { fetch_barrier.arrive_and_wait(); } @@ -525,7 +525,7 @@ __global__ void copy_validity_to_rows( group.sync(); for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { - if (validity_block != validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { + if (validity_block >= NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] .arrive_and_wait(); } @@ -645,10 +645,10 @@ get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num * */ __global__ void copy_from_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_block, const size_type *row_offsets, - int8_t **output_data, const size_type *_col_sizes, - const size_type *_col_offsets, const block_info *block_infos, - const size_type num_block_infos, const int8_t *input_data) { + const size_type shmem_used_per_block, const size_type *row_offsets, + int8_t **output_data, const size_type *_col_sizes, + const size_type *_col_offsets, const block_info *block_infos, + const size_type num_block_infos, const int8_t *input_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. 
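In isolation, that two-pass shape looks roughly like the sketch below; this is only an illustration with assumed names, since the real kernel stages whole block_infos with cuda::memcpy_async and barriers and remaps the column layout as it copies:

#include <cstdint>

__global__ void staged_copy(int8_t const *input, int8_t *output, int num_chunks,
                            int chunk_bytes) {
  extern __shared__ int8_t staging[];
  for (int chunk = blockIdx.x; chunk < num_chunks; chunk += gridDim.x) {
    long long const base = static_cast<long long>(chunk) * chunk_bytes;
    // pass 1: copy a chunk of data from global memory into shared memory
    for (int i = threadIdx.x; i < chunk_bytes; i += blockDim.x) {
      staging[i] = input[base + i];
    }
    __syncthreads();
    // pass 2: copy that chunk from shared memory out to its final location
    for (int i = threadIdx.x; i < chunk_bytes; i += blockDim.x) {
      output[base + i] = staging[i];
    }
    __syncthreads();  // make the staging buffer safe to reuse on the next iteration
  }
}

A launch of this sketch would pass the chunk size as the dynamic shared-memory argument, e.g. staged_copy<<<grid, block, chunk_bytes>>>(in, out, num_chunks, chunk_bytes).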
@@ -819,8 +819,8 @@ __global__ void copy_validity_from_rows( group.sync(); for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { - auto const validity_index = validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - if (validity_block != validity_index) { + if (validity_block >= NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { + auto const validity_index = validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; shared_block_barriers[validity_index].arrive_and_wait(); } int8_t *this_shared_block = shared_blocks[validity_block % 2]; @@ -1251,7 +1251,7 @@ std::vector> convert_to_rows(cudf::table_view cons // TODO: why? total_shmem -= 1024; - int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + int shmem_limit_per_block = total_shmem / NUM_BLOCKS_PER_KERNEL_LOADED; // break up the work into blocks, which are a starting and ending row/col #. // this window size is calculated based on the shared memory size available @@ -1368,7 +1368,7 @@ std::vector> convert_to_rows(cudf::table_view cons std::vector block_infos = build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); + auto dev_block_infos = make_device_uvector_async(block_infos, stream); // blast through the entire table and convert it dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); @@ -1382,12 +1382,11 @@ std::vector> convert_to_rows(cudf::table_view cons auto validity_block_infos = build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); - auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream); dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_VALIDITY_BLOCKS_PER_KERNEL)); dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); - detail::copy_validity_to_rows<<>>( + detail::copy_validity_to_rows<<>>( num_rows, num_columns, shmem_limit_per_block, dev_row_offsets.data(), dev_output_data.data(), column_starts.back(), dev_validity_block_infos.data(), validity_block_infos.size(), dev_input_nm.data()); @@ -1508,7 +1507,7 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in // TODO why? 
total_shmem -= 1024; - int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + int shmem_limit_per_block = total_shmem / NUM_BLOCKS_PER_KERNEL_LOADED; std::vector column_starts; std::vector column_sizes; @@ -1590,7 +1589,7 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in } auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_VALIDITY_BLOCKS_PER_KERNEL)); dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); detail:: From 131ca58fdfe2dfe7d0298d83a33d8e17ee41c34d Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Sat, 23 Oct 2021 01:37:52 +0000 Subject: [PATCH 29/80] removing unused header, suppressing shared warning for barrier, updating java bindings to use the correct namespace --- cpp/include/cudf/row_conversion.hpp | 51 ---------------------- java/src/main/native/src/TableJni.cpp | 9 ++-- java/src/main/native/src/row_conversion.cu | 6 ++- 3 files changed, 9 insertions(+), 57 deletions(-) delete mode 100644 cpp/include/cudf/row_conversion.hpp diff --git a/cpp/include/cudf/row_conversion.hpp b/cpp/include/cudf/row_conversion.hpp deleted file mode 100644 index 5d799f4c596..00000000000 --- a/cpp/include/cudf/row_conversion.hpp +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -#include -#include -#include - -namespace cudf { - -std::vector> convert_to_rows_fixed_width_optimized( - cudf::table_view const& tbl, - // TODO need something for validity - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -std::vector> convert_to_rows( - cudf::table_view const& tbl, - // TODO need something for validity - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr convert_from_rows_fixed_width_optimized( - cudf::lists_column_view const& input, - std::vector const& schema, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr convert_from_rows( - cudf::lists_column_view const& input, - std::vector const& schema, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -} // namespace cudf diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index d0e6b895a1e..28a12c36b4e 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -2701,7 +2700,7 @@ Java_ai_rapids_cudf_Table_convertToRowsFixedWidthOptimized(JNIEnv *env, jclass, cudf::jni::auto_set_device(env); cudf::table_view *n_input_table = reinterpret_cast(input_table); std::vector> cols = - cudf::convert_to_rows_fixed_width_optimized(*n_input_table); + cudf::java::convert_to_rows_fixed_width_optimized(*n_input_table); int num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); for (int i = 0; i < num_columns; i++) { @@ -2719,7 +2718,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env try { cudf::jni::auto_set_device(env); cudf::table_view *n_input_table = reinterpret_cast(input_table); - std::vector> cols = cudf::convert_to_rows(*n_input_table); + std::vector> cols = cudf::java::convert_to_rows(*n_input_table); int num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); for (int i = 0; i < num_columns; i++) { @@ -2746,7 +2745,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRowsFixedWidth types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i])); } std::unique_ptr result = - cudf::convert_from_rows_fixed_width_optimized(list_input, types_vec); + cudf::java::convert_from_rows_fixed_width_optimized(list_input, types_vec); return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); @@ -2769,7 +2768,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *e for (int i = 0; i < n_types.size(); i++) { types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i])); } - std::unique_ptr result = cudf::convert_from_rows(list_input, types_vec); + std::unique_ptr result = cudf::java::convert_from_rows(list_input, types_vec); return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index f5936e86bcd..af26e4c0b0d 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -34,7 +34,6 @@ #include #include 
#include -#include #include #include #include @@ -49,12 +48,17 @@ #include #include +#include "row_conversion.hpp" + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 2; constexpr auto NUM_BLOCKS_PER_KERNEL_TO_ROWS = 2; constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; + +// needed to suppress warning about cuda::barrier +#pragma diag_suppress static_var_with_dynamic_init #endif using cudf::detail::make_device_uvector_async; From d013e8b2e1182c29aac2783f3999fd86aa9087b8 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Sat, 30 Oct 2021 01:00:38 +0000 Subject: [PATCH 30/80] updating code to build block infos with thrust on the gpu --- java/src/main/native/src/row_conversion.cu | 670 +++++++++++++-------- 1 file changed, 418 insertions(+), 252 deletions(-) diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index af26e4c0b0d..87ab1ed49d8 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -21,6 +21,8 @@ #include #include +#include +#include #include #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 @@ -34,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -47,8 +50,7 @@ #include #include #include - -#include "row_conversion.hpp" +#include #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 2; @@ -64,7 +66,7 @@ constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; using cudf::detail::make_device_uvector_async; using rmm::device_uvector; namespace cudf { - +namespace java { namespace detail { static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size_t alignment) { @@ -324,6 +326,11 @@ __global__ void copy_to_rows_fixed_width_optimized( #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +/** + * @brief The GPU blocks work on one or more block_info structs of data. + * This structure defined the workspace for the block. + * + */ struct block_info { int start_col; int start_row; @@ -340,38 +347,36 @@ struct block_info { __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } }; -// When building the columns to return, we have to be mindful of the offset limit in cudf. -// It is 32-bit and these data columns are capable of surpassing that easily. The data should -// not be cut off exactly at the limit though due to the validity buffers. The most efficient -// place to cut the validity is on a 32-row boundary, so as we calculate the row sizes -// we keep track of the cut points for the validity, which we call row batches. If the row -// is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we -// hit. Note that this boundary is for our book-keeping with column pointers and not anything that -// the kernel needs to worry about. We cut the output at convienient boundaries when assembling -// the outgoing data stream. +/** + * @brief Returning rows is done in a byte cudf column. This is limited in size by + * `size_type` and so output is broken into batches of rows that fit inside + * this limit. 
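+ * As a worked example, at roughly 100 bytes per encoded row a single batch can
+ * hold about 21 million rows (2^31 / 100) before a new batch must be started.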
+ * + */ struct row_batch { size_type num_bytes; size_type row_count; + device_uvector row_offsets; }; /** - * @brief copy data from cudf columns into x format, which is row-based + * @brief copy data from cudf columns into JCUDF format, which is row-based * * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table + * @param shmem_used_per_block shared memory amount each `block_info` is using + * @param block_infos span of `block_info` structs the define the work * @param input_data pointer to raw table data - * @param input_nm pointer to validity data * @param col_sizes array of sizes for each element in a column - one per column * @param col_offsets offset into input data row for each column's start - * @param block_infos information about the blocks of work * @param row_offsets offset to a specific row in the input data * @param output_data pointer to output data * */ __global__ void copy_to_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_block, const size_type num_block_infos, - const int8_t **input_data, const size_type *col_sizes, - const size_type *col_offsets, const block_info *block_infos, + const size_type shmem_used_per_block, + device_span block_infos, const int8_t **input_data, + const size_type *col_sizes, const size_type *col_offsets, const size_type *row_offsets, int8_t **output_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. @@ -396,7 +401,7 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum group.sync(); auto const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS, + std::min((uint)block_infos.size() - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS, (uint)NUM_BLOCKS_PER_KERNEL_TO_ROWS); size_t fetch; @@ -491,23 +496,25 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets + * @param row_offsets offset to a specific row in the input data * @param output_data pointer to output data, partitioned by data size * @param validity_offsets offset into input data row for validity data * @param block_infos information about the blocks of work - * @param num_block_infos number of infos in blocks array * @param input_data pointer to input data * */ -__global__ void copy_validity_to_rows( - const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, - const size_type *row_offsets, int8_t **output_data, const size_type validity_offset, - const block_info *block_infos, const size_type num_block_infos, const bitmask_type **input_nm) { +__global__ void copy_validity_to_rows(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, + const size_type *row_offsets, int8_t **output_data, + const size_type validity_offset, + device_span block_infos, + const bitmask_type **input_nm) { extern __shared__ int8_t shared_data[]; int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { shared_data, shared_data + shmem_used_per_block / 2}; - // per conversation with DaveB + using cudf::detail::warp_size; + // each thread of warp reads a single int32 of validity - so we read 128 bytes // then ballot_sync the bits and write the result to shmem // after we fill 
shared mem memcpy it out in a blob. @@ -515,7 +522,7 @@ __global__ void copy_validity_to_rows( auto group = cooperative_groups::this_thread_block(); int const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, + std::min((uint)block_infos.size() - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); __shared__ cuda::barrier @@ -545,9 +552,9 @@ __global__ void copy_validity_to_rows( align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); auto const total_sections = num_sections_x * num_sections_y; - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + int const warp_id = threadIdx.x / warp_size; + int const lane_id = threadIdx.x % warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / warp_size); // the block is divided into sections. A warp operates on a section at a time. for (int my_section_idx = warp_id; my_section_idx < total_sections; @@ -567,7 +574,7 @@ __global__ void copy_validity_to_rows( input_nm[absolute_col][absolute_row / 32] : std::numeric_limits::max(); - // every thread that is participating in the warp has a byte, but it's column-based + // every thread that is participating in the warp has 4 bytes, but it's column-based // data and we need it in row-based. So we shuffle the bits around with ballot_sync to // make the bytes we actually write. bitmask_type dw_mask = 1; @@ -576,7 +583,7 @@ __global__ void copy_validity_to_rows( // lead thread in each warp writes data auto const validity_write_offset = validity_data_row_length * (relative_row + i) + relative_col / 8; - if (threadIdx.x % detail::warp_size == 0) { + if (threadIdx.x % warp_size == 0) { if (cols_left <= 8) { // write byte this_shared_block[validity_write_offset] = validity_data & 0xFF; @@ -625,6 +632,14 @@ __global__ void copy_validity_to_rows( } } +/** + * @brief Admin data is data stored in shared memory that isn't actual column data + * + * @param col_size_size size of the column size data. + * @param col_offset_size size of the column offset data. + * @param num_cols number of columns in the block. + * @return tuple of the size of column and offset admin data. 
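+ * For example, with 10 columns and 4-byte size and offset entries, a block
+ * needs 40 bytes for column sizes and 40 bytes for column offsets.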
+ */ static __device__ std::tuple get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num_cols) { auto const col_size_bytes = num_cols * col_size_size; @@ -639,9 +654,8 @@ get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table * @param shmem_used_per_block amount of shared memory that is used by a block - * @param row_offsets - * @param output_data - * @param output_nm + * @param row_offsets offset to a specific row in the input data + * @param output_data pointers to column data * @param col_sizes array of sizes for each element in a column - one per column * @param col_offsets offset into input data row for each column's start * @param block_infos information about the blocks of work @@ -651,8 +665,9 @@ get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num __global__ void copy_from_rows(const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, const size_type *row_offsets, int8_t **output_data, const size_type *_col_sizes, - const size_type *_col_offsets, const block_info *block_infos, - const size_type num_block_infos, const int8_t *input_data) { + const size_type *_col_offsets, + device_span block_infos, + const int8_t *input_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -678,8 +693,9 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col group.sync(); - auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS, - (uint)NUM_BLOCKS_PER_KERNEL_FROM_ROWS); + auto blocks_remaining = + std::min((uint)block_infos.size() - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS, + (uint)NUM_BLOCKS_PER_KERNEL_FROM_ROWS); size_t fetch_index; size_t processing_index; @@ -785,23 +801,24 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets - * @param output_nm + * @param row_offsets offset to a specific row in the input data + * @param output_nm pointers to null masks for columns * @param validity_offsets offset into input data row for validity data * @param block_infos information about the blocks of work - * @param num_block_infos number of infos in blocks array * @param input_data pointer to input data * */ -__global__ void copy_validity_from_rows( - const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, - const size_type *row_offsets, cudf::bitmask_type **output_nm, const size_type validity_offset, - const block_info *block_infos, const size_type num_block_infos, const int8_t *input_data) { +__global__ void +copy_validity_from_rows(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, const size_type *row_offsets, + cudf::bitmask_type **output_nm, const size_type validity_offset, + device_span block_infos, const int8_t *input_data) { extern __shared__ int8_t shared_data[]; int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { shared_data, shared_data + shmem_used_per_block / 2}; - // per conversation with DaveB + using cudf::detail::warp_size; 
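+  // Note: each lane of the warp holds the validity byte for one row, so
+  // __ballot_sync(participation_mask, my_byte & byte_mask) gathers bit i
+  // (column relative_col + i) from all 32 rows into a single 32-bit word,
+  // which ends up as one bitmask word of that column's null mask.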
+ // each thread of warp reads a single byte of validity - so we read 32 bytes // then ballot_sync the bits and write the result to shmem // after we fill shared mem memcpy it out in a blob. @@ -809,7 +826,7 @@ __global__ void copy_validity_from_rows( auto group = cooperative_groups::this_thread_block(); int const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, + std::min((uint)block_infos.size() - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); __shared__ cuda::barrier @@ -837,14 +854,14 @@ __global__ void copy_validity_from_rows( auto const num_sections_y = (num_block_rows + 31) / 32; auto const validity_data_col_length = num_sections_y * 4; // words to bytes auto const total_sections = num_sections_x * num_sections_y; - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + int const warp_id = threadIdx.x / warp_size; + int const lane_id = threadIdx.x % warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / warp_size); // the block is divided into sections. A warp operates on a section at a time. for (int my_section_idx = warp_id; my_section_idx < total_sections; my_section_idx += warps_per_block) { - // convert to rows and cols + // convert section to row and col auto const section_x = my_section_idx % num_sections_x; auto const section_y = my_section_idx / num_sections_x; auto const relative_col = section_x * 8; @@ -860,13 +877,13 @@ __global__ void copy_validity_from_rows( input_data[row_offsets[absolute_row] + validity_offset + absolute_col / 8]; // so every thread that is participating in the warp has a byte, but it's row-based - // data and we need it in column-based. So we shiffle the bits around to make + // data and we need it in column-based. So we shuffle the bits around to make // the bytes we actually write. for (int i = 0, byte_mask = 1; i < 8 && relative_col + i < num_columns; ++i, byte_mask <<= 1) { auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); // lead thread in each warp writes data - if (threadIdx.x % detail::warp_size == 0) { + if (threadIdx.x % warp_size == 0) { auto const validity_write_offset = validity_data_col_length * (relative_col + i) + relative_row / 8; @@ -898,10 +915,10 @@ __global__ void copy_validity_from_rows( // now async memcpy the shared for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { auto const relative_col = col - block.start_col; + auto const starting_address = output_nm[col] + word_index(block_start_row); cuda::memcpy_async( - output_nm[col] + word_index(block_start_row), - &this_shared_block[validity_data_col_length * relative_col], + starting_address, &this_shared_block[validity_data_col_length * relative_col], util::div_rounding_up_unsafe(num_block_rows, 8), shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); } @@ -919,7 +936,8 @@ __global__ void copy_validity_from_rows( #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 /** - * Calculate the dimensions of the kernel for fixed width only columns. + * @brief Calculate the dimensions of the kernel for fixed width only columns. + * * @param [in] num_columns the number of columns being copied. * @param [in] num_rows the number of rows being copied. * @param [in] size_per_row the size each row takes up when padded. 
@@ -995,7 +1013,7 @@ fixed_width_convert_to_rows(const cudf::size_type start_row, const cudf::size_ty rmm::device_uvector &input_nm, const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - int64_t total_allocation = size_per_row * num_rows; + int64_t const total_allocation = size_per_row * num_rows; // We made a mistake in the split somehow CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); @@ -1020,17 +1038,14 @@ fixed_width_convert_to_rows(const cudf::size_type start_row, const cudf::size_ty rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr); } -static cudf::data_type get_data_type(const cudf::column_view &v) { - return v.type(); -} - static inline bool are_all_fixed_width(std::vector const &schema) { return std::all_of(schema.begin(), schema.end(), [](const cudf::data_type &t) { return cudf::is_fixed_width(t); }); } /** - * Given a set of fixed width columns, calculate how the data will be laid out in memory. + * @brief Given a set of fixed width columns, calculate how the data will be laid out in memory. + * * @param [in] schema the types of columns that need to be laid out. * @param [out] column_start the byte offset where each column starts in the row. * @param [out] column_size the size in bytes of the data for each columns in the row. @@ -1065,19 +1080,25 @@ static inline int32_t compute_fixed_width_layout(std::vector co #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +/** + * @brief Compute information about a table such as bytes per row and offsets. + * + * @tparam iterator iterator of column schema data + * @param begin starting iterator of column schema + * @param end ending iterator of column schema + * @param column_starts column start offsets + * @param column_sizes size in bytes of each column + * @return size of the fixed_width data portion of a row. + */ template static size_type compute_column_information(iterator begin, iterator end, std::vector &column_starts, - std::vector &column_sizes) //, -// std::function nested_type_cb) -{ + std::vector &column_sizes) { size_type fixed_width_size_per_row = 0; for (auto cv = begin; cv != end; ++cv) { auto col_type = std::get<0>(*cv); bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - // if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } - // a list or string column will write a single uint64 // of data here for offset/length auto col_size = nested_type ? 8 : size_of(col_type); @@ -1096,6 +1117,15 @@ static size_type compute_column_information(iterator begin, iterator end, return fixed_width_size_per_row; } +/** + * @brief Build `block_info` for the validity data to break up the work. 
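+ * The validity kernels work on these blocks in 8-column by 32-row sections
+ * (one section per warp iteration), so column strides are kept to multiples
+ * of 8 and row strides to multiples of 32 where possible.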
+ * + * @param num_columns number of columns in the table + * @param num_rows number of rows in the table + * @param shmem_limit_per_block size of shared memory available to a single gpu block + * @param row_batches batched row information for multiple output locations + * @return vector of `block_info` structs for validity data + */ std::vector build_validity_block_infos(size_type const &num_columns, size_type const &num_rows, size_type const &shmem_limit_per_block, @@ -1139,43 +1169,202 @@ build_validity_block_infos(size_type const &num_columns, size_type const &num_ro return validity_block_infos; } -std::vector build_block_infos(std::vector const &column_sizes, - std::vector const &column_starts, - std::vector const &row_batches, - size_type const total_number_of_rows, - size_type const &shmem_limit_per_block) { - std::vector block_infos; +constexpr size_type max_batch_size = std::numeric_limits::max(); + +/** + * @brief Holds information about the batches of data to be processed + * + */ +struct batch_data { + std::vector batch_row_boundaries; + device_uvector input_data_row_offsets; + std::vector row_batches; + + batch_data(size_type num_input_offsets, rmm::cuda_stream_view stream) + : input_data_row_offsets(num_input_offsets, stream){}; +}; +/** + * @brief Builds batches of rows that will fit in the size limit of a column. + * + * @tparam RowSize iterator that gives the size of a specific row of the table. + * @param num_rows Total number of rows in the table + * @param row_sizes iterator that gives the size of a specific row of the table. + * @param stream stream to operate on for this work + * @param mr memory resource used to allocate any returned data + * @returns vector of size_type's that indicate row numbers for batch boundaries and a + * device_uvector of row offsets + */ + +template +batch_data build_batches(size_type num_rows, RowSize row_sizes, rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { + auto const total_size = thrust::reduce(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows); + auto const num_batches = util::div_rounding_up_safe(total_size, max_batch_size); + auto const num_offsets = num_batches + 1; + batch_data ret(num_rows + 1, stream); + + // at most max gpu memory / 2GB iterations. + ret.batch_row_boundaries.reserve(num_offsets); + ret.batch_row_boundaries.push_back(0); + size_type last_row_end = 0; + device_uvector cumulative_row_sizes(num_rows, stream); + thrust::inclusive_scan(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows, + cumulative_row_sizes.begin()); + while ((int)ret.batch_row_boundaries.size() < num_offsets) { + // find the next max_batch_size boundary + size_type const row_end = + ((thrust::lower_bound(rmm::exec_policy(stream), cumulative_row_sizes.begin(), + cumulative_row_sizes.begin() + (num_rows - last_row_end), + max_batch_size) - + cumulative_row_sizes.begin()) + + last_row_end); + + // build offset list for each row in this batch + auto const num_entries = row_end - last_row_end + 1; + device_uvector output_batch_row_offsets(num_entries, stream, mr); + + auto row_size_iter_bounded = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [row_end, row_sizes, last_row_end] __device__(auto i) { + return i >= row_end ? 
0 : row_sizes[i + last_row_end]; + }); + + thrust::exclusive_scan(rmm::exec_policy(stream), row_size_iter_bounded, + row_size_iter_bounded + num_entries, output_batch_row_offsets.begin()); + + ret.batch_row_boundaries.push_back(row_end); + auto const batch_bytes = output_batch_row_offsets.element(row_end, stream) - + output_batch_row_offsets.element(last_row_end, stream); + auto const num_rows_in_batch = row_end - last_row_end; + ret.row_batches.push_back( + {batch_bytes, num_rows_in_batch, std::move(output_batch_row_offsets)}); + last_row_end = row_end; + } + + auto row_size_iter = cudf::detail::make_counting_transform_iterator( + 0, [row_sizes, num_rows] __device__(auto i) { return (i < num_rows) ? row_sizes[i] : 0; }); + thrust::exclusive_scan(rmm::exec_policy(stream), row_size_iter, row_size_iter + num_rows + 1, + ret.input_data_row_offsets.begin()); + + return ret; +} + +/** + * @brief Computes the number of blocks necessary given a window height and batch offsets + * + * @param batch_row_offsets row offsets for each batch + * @param desired_window_height height of each window in the table + * @param stream stream to use + * @return number of windows necessary + */ +int compute_block_counts(device_span const &batch_row_offsets, + int desired_window_height, rmm::cuda_stream_view stream) { + size_type const num_batches = batch_row_offsets.size() - 1; + device_uvector num_blocks(num_batches, stream); + auto iter = thrust::make_counting_iterator(0); + thrust::transform( + rmm::exec_policy(stream), iter, iter + num_batches, num_blocks.begin(), + [desired_window_height, + batch_row_offsets = batch_row_offsets.data()] __device__(auto batch_index) -> size_type { + return util::div_rounding_up_unsafe(batch_row_offsets[batch_index + 1] - + batch_row_offsets[batch_index], + desired_window_height); + }); + return thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); +} + +/** + * @brief Builds the `block_info` structs for a given table. + * + * @param blocks span of blocks to populate + * @param batch_row_offsets offsets to row batches + * @param column_start starting column of the window + * @param column_end ending column of the window + * @param desired_window_height height of the window + * @param total_number_of_rows total number of rows in the table + * @param stream stream to use + * @return number of windows created + */ +size_type +build_blocks(device_span blocks, + device_uvector const &batch_row_offsets, // comes from build_batches + int column_start, int column_end, int desired_window_height, int total_number_of_rows, + rmm::cuda_stream_view stream) { + size_type const num_batches = batch_row_offsets.size() - 1; + device_uvector num_blocks(num_batches, stream); + auto iter = thrust::make_counting_iterator(0); + thrust::transform( + rmm::exec_policy(stream), iter, iter + num_batches, num_blocks.begin(), + [desired_window_height, + batch_row_offsets = batch_row_offsets.data()] __device__(auto batch_index) -> size_type { + return util::div_rounding_up_unsafe(batch_row_offsets[batch_index + 1] - + batch_row_offsets[batch_index], + desired_window_height); + }); + + size_type const total_blocks = + thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); + + device_uvector block_starts(num_batches + 1, stream); + auto block_iter = cudf::detail::make_counting_transform_iterator( + 0, [num_blocks = num_blocks.data(), num_batches] __device__(auto i) { + return (i < num_batches) ? 
num_blocks[i] : 0; + }); + thrust::exclusive_scan(rmm::exec_policy(stream), block_iter, block_iter + num_batches + 1, + block_starts.begin()); // in blocks + + thrust::transform( + rmm::exec_policy(stream), iter, iter + total_blocks, blocks.begin(), + [=, block_starts = block_starts.data(), + batch_row_offsets = batch_row_offsets.data()] __device__(size_type block_index) { + // what batch this block falls in + auto const batch_index_iter = + thrust::upper_bound(thrust::seq, block_starts, block_starts + num_batches, block_index); + auto const batch_index = std::distance(block_starts, batch_index_iter) - 1; + // local index within the block + int const local_block_index = block_index - block_starts[batch_index]; + // the start row for this batch. + int const batch_row_start = batch_row_offsets[batch_index]; + // the start row for this block + int const block_row_start = batch_row_start + (local_block_index * desired_window_height); + // the end row for this block + int const max_row = std::min(total_number_of_rows - 1, + batch_index + 1 > num_batches ? + std::numeric_limits::max() : + static_cast(batch_row_offsets[batch_index + 1]) - 1); + int const block_row_end = std::min( + batch_row_start + ((local_block_index + 1) * desired_window_height) - 1, max_row); + + // stuff the block + return block_info{column_start, block_row_start, column_end, block_row_end, + static_cast(batch_index)}; + }); + + return total_blocks; +} + +/** + * @brief Determines what data should be operated on by each block for the incoming table. + * + * @tparam WindowCallback Callback that receives the start and end columns of windows + * @param column_sizes vector of the size of each column + * @param column_starts vector of the offset of each column + * @param first_row_batch_size size of the first row batch to limit max window size since a window + * is unable to span batches + * @param total_number_of_rows total number of rows in the table + * @param shmem_limit_per_block shared memory allowed per block + * @param f callback function called when building a window + */ +template +void determine_windows(std::vector const &column_sizes, + std::vector const &column_starts, + size_type const first_row_batch_size, size_type const total_number_of_rows, + size_type const &shmem_limit_per_block, WindowCallback f) { // block infos are organized with the windows going "down" the columns // this provides the most coalescing of memory access int current_window_width = 0; int current_window_start_col = 0; - // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( - int const start_col, int const end_col, int const desired_window_height) { - int current_window_start_row = 0; - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; - while (i < total_number_of_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(desired_window_height, rows_left_in_batch); - - block_infos.emplace_back(detail::block_info{ - start_col, current_window_start_row, end_col, - std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), - current_window_row_batch}); - - i += window_height; - current_window_start_row += window_height; - rows_left_in_batch -= window_height; - } - }; - // the ideal window height has lots of 8-byte reads and 8-byte writes. 
The optimal read/write // would be memory cache line sized access, but since other blocks will read/write the edges // this may not turn out to be overly important. For now, we will attempt to build a square @@ -1183,12 +1372,10 @@ std::vector build_block_infos(std::vector const &column_s // shared_mem_size since we want them equal, so height and width are sqrt(shared_mem_size). The // trick is that it's in bytes, not rows or columns. size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); - int const window_height = std::clamp( - util::round_up_safe( - std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], - total_number_of_rows), - 32), - 1, row_batches[0].row_count); + int const window_height = + std::clamp(util::round_up_safe( + std::min(optimal_square_len / column_sizes[0], total_number_of_rows), 32), + 1, first_row_batch_size); auto calc_admin_data_size = [](int num_cols) -> size_type { // admin data is the column sizes and column start information. @@ -1213,7 +1400,8 @@ std::vector build_block_infos(std::vector const &column_s calc_admin_data_size(col - current_window_start_col) > shmem_limit_per_block) { // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col == 0 ? col : col - 1, window_height); + f(current_window_start_col, col == 0 ? col : col - 1, window_height); + row_size = detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); row_size += col_size; // alignment required for shared memory window boundary to match @@ -1228,12 +1416,24 @@ std::vector build_block_infos(std::vector const &column_s // build last set of blocks if (current_window_width > 0) { - build_blocks(current_window_start_col, (int)column_sizes.size() - 1, window_height); + f(current_window_start_col, (int)column_sizes.size() - 1, window_height); } - - return block_infos; } +struct row_size_functor { + size_type _fixed_width_size_per_row; + size_type _num_columns; + row_size_functor(size_t fixed_width_size_per_row, size_t num_columns) + : _fixed_width_size_per_row(fixed_width_size_per_row), _num_columns(num_columns){}; + + CUDA_DEVICE_CALLABLE + int operator()(int row_index) { + auto const bytes_needed = + _fixed_width_size_per_row + util::div_rounding_up_safe(_num_columns, 8); + return detail::align_offset(bytes_needed, 8); + } +}; + #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } // namespace detail @@ -1242,9 +1442,6 @@ std::vector> convert_to_rows(cudf::table_view cons rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the - // data, but small enough that multiple columns fit in memory so the writes can coalese as well. - // Potential optimization for window sizes. const size_type num_columns = tbl.num_columns(); const size_type num_rows = tbl.num_rows(); @@ -1253,7 +1450,7 @@ std::vector> convert_to_rows(cudf::table_view cons int total_shmem; CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - // TODO: why? + // TODO: why is this needed. kernel fails to launch if all memory is requested. 
total_shmem -= 1024; int shmem_limit_per_block = total_shmem / NUM_BLOCKS_PER_KERNEL_LOADED; @@ -1277,150 +1474,113 @@ std::vector> convert_to_rows(cudf::table_view cons std::vector input_nm; input_data.reserve(num_columns); input_nm.reserve(num_columns); - for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); - auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (!nested_type) { - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - } + std::transform( + tbl.begin(), tbl.end(), std::back_inserter(input_data), + [](cudf::column_view const &c) -> int8_t const * { return c.template data(); }); + std::transform(tbl.begin(), tbl.end(), std::back_inserter(input_nm), + [](auto c) { return c.null_mask(); }); auto dev_input_data = make_device_uvector_async(input_data, stream, mr); auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); - std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column std::vector column_starts; // offset of column inside a row including alignment - std::vector - variable_width_columns; // list of the variable width columns in the table - row_sizes.reserve(num_rows); - row_offsets.reserve(num_rows); column_sizes.reserve(num_columns); column_starts.reserve(num_columns + 1); // we add a final offset for validity data start - auto iter = + auto schema_column_iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&tbl](auto i) -> std::tuple { return std::make_tuple(tbl.column(i).type(), tbl.column(i)); }); - size_type fixed_width_size_per_row = - detail::compute_column_information(iter, iter + num_columns, column_starts, column_sizes); + size_type fixed_width_size_per_row = detail::compute_column_information( + schema_column_iter, schema_column_iter + num_columns, column_starts, column_sizes); auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); - std::vector row_batches; - - uint64_t row_batch_size = 0; - uint64_t total_table_size = 0; - size_type row_batch_rows = 0; - uint64_t row_offset = 0; + // total encoded row size. This includes fixed-width data, validity, and variable-width data. + auto row_size_iter = cudf::detail::make_counting_transform_iterator( + 0, detail::row_size_functor(fixed_width_size_per_row, num_columns)); // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. 
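  // As a worked example of row_size_functor: a table with 40 bytes of fixed-width
  // data and 9 columns needs 40 + ceil(9 / 8) = 40 + 2 = 42 bytes per row, which
  // align_offset rounds up to 48 so that every row starts on an 8-byte boundary.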
auto validity_size = num_bitmask_words(num_columns) * 4; - // thrust - for (int row = 0; row < num_rows; ++row) { - auto aligned_row_batch_size = - detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned - row_sizes[row] = fixed_width_size_per_row; - // validity is byte aligned - row_sizes[row] += validity_size; - // variable width data is 8-byte aligned - row_sizes[row] = detail::align_offset(row_sizes[row], 8); // rows are 8 byte aligned - - if ((uint64_t)aligned_row_batch_size + row_sizes[row] > - (uint64_t)std::numeric_limits::max()) { - // a new batch starts at the last 32-row boundary - row_batches.push_back( - detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); - row_batch_size = 0; - row_batch_rows = row_batch_rows & 31; - row_offset = 0; - aligned_row_batch_size = 0; - } - row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned - row_offsets.push_back(row_offset); - row_batch_size = aligned_row_batch_size + row_sizes[row]; - row_offset += row_sizes[row]; - total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned - total_table_size += row_sizes[row]; - row_batch_rows++; - } - if (row_batch_size > 0) { - row_batches.push_back( - detail::row_batch{static_cast(row_batch_size), row_batch_rows}); - } - auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); + auto batch_info = detail::build_batches(num_rows, row_size_iter, stream, mr); + auto gpu_batch_row_boundaries = + make_device_uvector_async(batch_info.batch_row_boundaries, stream); + + // the first batch always exists unless we were sent an empty table + auto const first_batch_size = batch_info.row_batches[0].row_count; std::vector output_buffers; std::vector output_data; - output_data.reserve(row_batches.size()); - for (uint i = 0; i < row_batches.size(); ++i) { - rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); + output_data.reserve(batch_info.row_batches.size()); + for (uint i = 0; i < batch_info.row_batches.size(); ++i) { + rmm::device_buffer temp(batch_info.row_batches[i].num_bytes, stream, mr); output_data.push_back(static_cast(temp.data())); output_buffers.push_back(std::move(temp)); } auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - - auto dev_block_infos = make_device_uvector_async(block_infos, stream); + int info_count = 0; + detail::determine_windows( + column_sizes, column_starts, first_batch_size, num_rows, shmem_limit_per_block, + [&gpu_batch_row_boundaries, &info_count, &stream](int const start_col, int const end_col, + int const window_height) { + int i = detail::compute_block_counts(gpu_batch_row_boundaries, window_height, stream); + info_count += i; + }); + + // allocate space for blocks + device_uvector gpu_block_infos(info_count, stream); + int block_offset = 0; + + detail::determine_windows( + column_sizes, column_starts, first_batch_size, num_rows, shmem_limit_per_block, + [&gpu_batch_row_boundaries, &gpu_block_infos, num_rows, &block_offset, + stream](int const start_col, int const end_col, int const window_height) { + block_offset += detail::build_blocks( + {gpu_block_infos.data() + block_offset, gpu_block_infos.size() - block_offset}, + gpu_batch_row_boundaries, start_col, end_col, window_height, num_rows, stream); + }); // blast through the entire table and convert it - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), 
NUM_BLOCKS_PER_KERNEL_TO_ROWS)); + dim3 blocks(util::div_rounding_up_unsafe(gpu_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); dim3 threads(256); detail::copy_to_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, block_infos.size(), dev_input_data.data(), - dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), dev_row_offsets.data(), + num_rows, num_columns, shmem_limit_per_block, gpu_block_infos, dev_input_data.data(), + dev_col_sizes.data(), dev_col_starts.data(), batch_info.input_data_row_offsets.data(), reinterpret_cast(dev_output_data.data())); - auto validity_block_infos = - build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); + auto validity_block_infos = detail::build_validity_block_infos( + num_columns, num_rows, shmem_limit_per_block, batch_info.row_batches); auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream); dim3 validity_blocks( util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_VALIDITY_BLOCKS_PER_KERNEL)); dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); detail::copy_validity_to_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, dev_row_offsets.data(), dev_output_data.data(), - column_starts.back(), dev_validity_block_infos.data(), validity_block_infos.size(), - dev_input_nm.data()); + num_rows, num_columns, shmem_limit_per_block, batch_info.input_data_row_offsets.data(), + dev_output_data.data(), column_starts.back(), dev_validity_block_infos, dev_input_nm.data()); // split up the output buffer into multiple buffers based on row batch sizes // and create list of byte columns - int offset_offset = 0; std::vector> ret; - for (uint i = 0; i < row_batches.size(); ++i) { - // compute offsets for this row batch - std::vector offset_vals; - offset_vals.reserve(row_batches[i].row_count + 1); - size_type cur_offset = 0; - offset_vals.push_back(cur_offset); - for (int row = 0; row < row_batches[i].row_count; ++row) { - cur_offset = detail::align_offset(cur_offset, 8) + row_sizes[row + offset_offset]; - offset_vals.push_back(cur_offset); - } - offset_offset += row_batches[i].row_count; - - auto dev_offsets = make_device_uvector_async(offset_vals, stream, mr); - auto offsets = std::make_unique(data_type{type_id::INT32}, - (size_type)offset_vals.size(), dev_offsets.release()); - - auto data = std::make_unique(data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, - std::move(output_buffers[i])); - - ret.push_back( - cudf::make_lists_column(row_batches[i].row_count, std::move(offsets), std::move(data), 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr)); + for (int batch = 0; batch < (int)batch_info.row_batches.size(); ++batch) { + auto const offset_count = batch_info.row_batches[batch].row_offsets.size(); + auto offsets = std::make_unique(data_type{type_id::INT32}, (size_type)offset_count, + batch_info.row_batches[batch].row_offsets.release()); + auto data = + std::make_unique(data_type{type_id::INT8}, batch_info.row_batches[batch].num_bytes, + std::move(output_buffers[batch])); + + ret.push_back(cudf::make_lists_column( + batch_info.row_batches[batch].row_count, std::move(offsets), std::move(data), 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr)); } return ret; @@ -1437,7 +1597,8 @@ convert_to_rows_fixed_width_optimized(cudf::table_view const &tbl, rmm::cuda_str std::vector schema; schema.resize(num_columns); - std::transform(tbl.begin(), tbl.end(), schema.begin(), detail::get_data_type); + 
std::transform(tbl.begin(), tbl.end(), schema.begin(), + [](auto i) -> cudf::data_type { return i.type(); }); if (detail::are_all_fixed_width(schema)) { std::vector column_start; @@ -1509,7 +1670,7 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in int total_shmem; CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - // TODO why? + // TODO: why is this needed. kernel fails to launch if all memory is requested. total_shmem -= 1024; int shmem_limit_per_block = total_shmem / NUM_BLOCKS_PER_KERNEL_LOADED; @@ -1519,8 +1680,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { return std::make_tuple(schema[i], nullptr); }); - size_type fixed_width_size_per_row = detail::compute_column_information( - iter, iter + num_columns, column_starts, column_sizes); //, [](void *) {}); + size_type fixed_width_size_per_row = + detail::compute_column_information(iter, iter + num_columns, column_starts, column_sizes); size_type validity_size = num_bitmask_words(num_columns) * 4; @@ -1534,8 +1695,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in // build the row_batches from the passed in list column std::vector row_batches; - - row_batches.push_back(detail::row_batch{child.size(), num_rows}); + row_batches.push_back( + {detail::row_batch{child.size(), num_rows, device_uvector(0, stream)}}); // Allocate the columns we are going to write into std::vector> output_columns; @@ -1553,45 +1714,48 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in auto dev_output_data = make_device_uvector_async(output_data, stream, mr); auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); + // only ever get a single batch when going from rows, so boundaries + // are 0, num_rows + device_uvector gpu_batch_row_boundaries(2, stream); + + thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), + thrust::make_counting_iterator(2), gpu_batch_row_boundaries.begin(), + [num_rows] __device__(auto i) { return i == 0 ? 
0 : num_rows; }); + + int info_count = 0; + detail::determine_windows(column_sizes, column_starts, num_rows, num_rows, shmem_limit_per_block, + [&gpu_batch_row_boundaries, &info_count, &stream]( + int const start_col, int const end_col, int const window_height) { + info_count += detail::compute_block_counts(gpu_batch_row_boundaries, + window_height, stream); + }); + + // allocate space for blocks + device_uvector gpu_block_infos(info_count, stream); + + int block_offset = 0; + detail::determine_windows( + column_sizes, column_starts, num_rows, num_rows, shmem_limit_per_block, + [&gpu_batch_row_boundaries, &gpu_block_infos, num_rows, &block_offset, + stream](int const start_col, int const end_col, int const window_height) { + block_offset += detail::build_blocks( + {gpu_block_infos.data() + block_offset, gpu_block_infos.size() - block_offset}, + gpu_batch_row_boundaries, start_col, end_col, window_height, num_rows, stream); + }); + + dim3 blocks( + util::div_rounding_up_unsafe(gpu_block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); dim3 threads(std::min(std::min(256, shmem_limit_per_block / 8), (int)child.size())); detail::copy_from_rows<<>>( num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), - dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), - block_infos.size(), child.data()); + dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), gpu_block_infos, + child.data()); + + auto validity_block_infos = + detail::build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); + + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream); - auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); - auto const column_stride = [&]() { - if (desired_rows_and_columns > num_columns) { - // not many columns, group it into 64s and ship it off - return std::min(64, num_columns); - } else { - return util::round_down_safe(desired_rows_and_columns, 8); - } - }(); - auto const row_stride = [&]() { - // we fit as much as we can, we know the column stride now, so calculate the row - return std::min(num_rows, util::round_down_safe(shmem_limit_per_block * 8 / column_stride, 32)); - /* if (desired_rows_and_columns > num_rows) { - return std::min(32, num_rows); - } else { - return util::round_down_safe(desired_rows_and_columns, 32); - }*/ - }(); - std::vector validity_block_infos; - for (int col = 0; col < num_columns; col += column_stride) { - for (int row = 0; row < num_rows; row += row_stride) { - validity_block_infos.emplace_back( - detail::block_info{col, row, std::min(col + column_stride - 1, num_columns - 1), - std::min(row + row_stride - 1, num_rows - 1)}); - } - } - auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); dim3 validity_blocks( util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_VALIDITY_BLOCKS_PER_KERNEL)); @@ -1599,8 +1763,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in detail:: copy_validity_from_rows<<>>( num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), - dev_output_nm.data(), column_starts.back(), dev_validity_block_infos.data(), - validity_block_infos.size(), child.data()); + dev_output_nm.data(), column_starts.back(), dev_validity_block_infos, + child.data()); return std::make_unique(std::move(output_columns)); #else @@ -1665,4 +1829,6 @@ std::unique_ptr convert_from_rows_fixed_width_optimized( } } +} // namespace java + } // namespace cudf From 
70e39cd58b26c5576140f9c95fbee13edeffff19 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 4 Nov 2021 00:02:24 +0000 Subject: [PATCH 31/80] fixing overflow issues with large tables --- java/src/main/native/src/row_conversion.cu | 202 +++++++++++---------- 1 file changed, 110 insertions(+), 92 deletions(-) diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 87ab1ed49d8..c5bbed5274c 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -15,6 +15,8 @@ */ #include +#include +#include #include #include #include @@ -25,6 +27,8 @@ #include #include +#include "thrust/scan.h" + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 #include #endif @@ -50,7 +54,6 @@ #include #include #include -#include #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 2; @@ -336,7 +339,7 @@ struct block_info { int start_row; int end_col; int end_row; - int buffer_num; + int batch_number; __host__ __device__ size_type get_shared_row_size(size_type const *const col_offsets, size_type const *const col_sizes) const { @@ -369,7 +372,7 @@ struct row_batch { * @param input_data pointer to raw table data * @param col_sizes array of sizes for each element in a column - one per column * @param col_offsets offset into input data row for each column's start - * @param row_offsets offset to a specific row in the input data + * @param row_offsets offset to a specific row in the output data * @param output_data pointer to output data * */ @@ -470,7 +473,7 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + subset]; auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; - auto const block_output_buffer = output_data[block.buffer_num]; + auto const block_output_buffer = output_data[block.batch_number]; // copy entire rows to final dest for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; @@ -496,7 +499,7 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table * @param shmem_used_per_block amount of shared memory that is used by a block - * @param row_offsets offset to a specific row in the input data + * @param row_offsets offset to a specific row in the output data * @param output_data pointer to output data, partitioned by data size * @param validity_offsets offset into input data row for validity data * @param block_infos information about the blocks of work @@ -610,7 +613,7 @@ __global__ void copy_validity_to_rows(const size_type num_rows, const size_type group.sync(); auto const output_data_base = - output_data[block.buffer_num] + validity_offset + block.start_col / 8; + output_data[block.batch_number] + validity_offset + block.start_col / 8; // now async memcpy the shared memory out to the final destination for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { @@ -1176,12 +1179,18 @@ constexpr size_type max_batch_size = std::numeric_limits::max(); * */ struct batch_data { + device_uvector batch_row_offsets; std::vector batch_row_boundaries; - device_uvector input_data_row_offsets; std::vector row_batches; +}; - batch_data(size_type num_input_offsets, rmm::cuda_stream_view stream) - : 
input_data_row_offsets(num_input_offsets, stream){}; +template struct row_size_functor { + RowSize _row_sizes; + size_type _num_rows; + row_size_functor(RowSize row_sizes) : _row_sizes(row_sizes){}; + + CUDA_DEVICE_CALLABLE + uint64_t operator()(int row_index) { return static_cast(_row_sizes[row_index]); } }; /** @@ -1199,19 +1208,26 @@ struct batch_data { template batch_data build_batches(size_type num_rows, RowSize row_sizes, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - auto const total_size = thrust::reduce(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows); - auto const num_batches = util::div_rounding_up_safe(total_size, max_batch_size); + auto uint64_row_sizes = + cudf::detail::make_counting_transform_iterator(0, row_size_functor(row_sizes)); + auto const total_size = + thrust::reduce(rmm::exec_policy(stream), uint64_row_sizes, uint64_row_sizes + num_rows); + auto const num_batches = static_cast( + util::div_rounding_up_safe(total_size, static_cast(max_batch_size))); auto const num_offsets = num_batches + 1; - batch_data ret(num_rows + 1, stream); + std::vector row_batches; + std::vector batch_row_boundaries; + device_uvector batch_row_offsets(num_rows, stream); // at most max gpu memory / 2GB iterations. - ret.batch_row_boundaries.reserve(num_offsets); - ret.batch_row_boundaries.push_back(0); + batch_row_boundaries.reserve(num_offsets); + batch_row_boundaries.push_back(0); size_type last_row_end = 0; - device_uvector cumulative_row_sizes(num_rows, stream); - thrust::inclusive_scan(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows, + device_uvector cumulative_row_sizes(num_rows, stream); + thrust::inclusive_scan(rmm::exec_policy(stream), uint64_row_sizes, uint64_row_sizes + num_rows, cumulative_row_sizes.begin()); - while ((int)ret.batch_row_boundaries.size() < num_offsets) { + + while ((int)batch_row_boundaries.size() < num_offsets) { // find the next max_batch_size boundary size_type const row_end = ((thrust::lower_bound(rmm::exec_policy(stream), cumulative_row_sizes.begin(), @@ -1220,6 +1236,9 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, rmm::cuda_stream cumulative_row_sizes.begin()) + last_row_end); + // build offset list for each row in this batch + auto const num_rows_in_batch = row_end - last_row_end; + // build offset list for each row in this batch auto const num_entries = row_end - last_row_end + 1; device_uvector output_batch_row_offsets(num_entries, stream, mr); @@ -1232,44 +1251,44 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, rmm::cuda_stream thrust::exclusive_scan(rmm::exec_policy(stream), row_size_iter_bounded, row_size_iter_bounded + num_entries, output_batch_row_offsets.begin()); - ret.batch_row_boundaries.push_back(row_end); - auto const batch_bytes = output_batch_row_offsets.element(row_end, stream) - - output_batch_row_offsets.element(last_row_end, stream); - auto const num_rows_in_batch = row_end - last_row_end; - ret.row_batches.push_back( - {batch_bytes, num_rows_in_batch, std::move(output_batch_row_offsets)}); + auto const batch_bytes = output_batch_row_offsets.element(num_rows_in_batch, stream); + + // The output_batch_row_offsets vector is used as the offset column of the returned data. This + // needs to be individually allocated, but the kernel needs a contiguous array of offsets or + // more global lookups are necessary. 
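+    // Copy this batch's offsets into the contiguous batch_row_offsets array.
+    // Note (editorial): this cudaMemcpy executes on the default stream;
+    // cudaMemcpyAsync on `stream` could avoid serializing against it.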
+ cudaMemcpy(batch_row_offsets.data() + last_row_end, output_batch_row_offsets.data(), + num_rows_in_batch * sizeof(size_type), cudaMemcpyDeviceToDevice); + + batch_row_boundaries.push_back(row_end); + row_batches.push_back({batch_bytes, num_rows_in_batch, std::move(output_batch_row_offsets)}); + last_row_end = row_end; } - auto row_size_iter = cudf::detail::make_counting_transform_iterator( - 0, [row_sizes, num_rows] __device__(auto i) { return (i < num_rows) ? row_sizes[i] : 0; }); - thrust::exclusive_scan(rmm::exec_policy(stream), row_size_iter, row_size_iter + num_rows + 1, - ret.input_data_row_offsets.begin()); - - return ret; + return {std::move(batch_row_offsets), batch_row_boundaries, std::move(row_batches)}; } /** * @brief Computes the number of blocks necessary given a window height and batch offsets * - * @param batch_row_offsets row offsets for each batch + * @param batch_row_boundaries row boundaries for each batch * @param desired_window_height height of each window in the table * @param stream stream to use * @return number of windows necessary */ -int compute_block_counts(device_span const &batch_row_offsets, +int compute_block_counts(device_span const &batch_row_boundaries, int desired_window_height, rmm::cuda_stream_view stream) { - size_type const num_batches = batch_row_offsets.size() - 1; + size_type const num_batches = batch_row_boundaries.size() - 1; device_uvector num_blocks(num_batches, stream); auto iter = thrust::make_counting_iterator(0); - thrust::transform( - rmm::exec_policy(stream), iter, iter + num_batches, num_blocks.begin(), - [desired_window_height, - batch_row_offsets = batch_row_offsets.data()] __device__(auto batch_index) -> size_type { - return util::div_rounding_up_unsafe(batch_row_offsets[batch_index + 1] - - batch_row_offsets[batch_index], - desired_window_height); - }); + thrust::transform(rmm::exec_policy(stream), iter, iter + num_batches, num_blocks.begin(), + [desired_window_height, + batch_row_boundaries = + batch_row_boundaries.data()] __device__(auto batch_index) -> size_type { + return util::div_rounding_up_unsafe(batch_row_boundaries[batch_index + 1] - + batch_row_boundaries[batch_index], + desired_window_height); + }); return thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); } @@ -1277,7 +1296,7 @@ int compute_block_counts(device_span const &batch_row_offsets, * @brief Builds the `block_info` structs for a given table. 
* * @param blocks span of blocks to populate - * @param batch_row_offsets offsets to row batches + * @param batch_row_boundaries boundary to row batches * @param column_start starting column of the window * @param column_end ending column of the window * @param desired_window_height height of the window @@ -1287,20 +1306,20 @@ int compute_block_counts(device_span const &batch_row_offsets, */ size_type build_blocks(device_span blocks, - device_uvector const &batch_row_offsets, // comes from build_batches + device_uvector const &batch_row_boundaries, // comes from build_batches int column_start, int column_end, int desired_window_height, int total_number_of_rows, rmm::cuda_stream_view stream) { - size_type const num_batches = batch_row_offsets.size() - 1; + size_type const num_batches = batch_row_boundaries.size() - 1; device_uvector num_blocks(num_batches, stream); auto iter = thrust::make_counting_iterator(0); - thrust::transform( - rmm::exec_policy(stream), iter, iter + num_batches, num_blocks.begin(), - [desired_window_height, - batch_row_offsets = batch_row_offsets.data()] __device__(auto batch_index) -> size_type { - return util::div_rounding_up_unsafe(batch_row_offsets[batch_index + 1] - - batch_row_offsets[batch_index], - desired_window_height); - }); + thrust::transform(rmm::exec_policy(stream), iter, iter + num_batches, num_blocks.begin(), + [desired_window_height, + batch_row_boundaries = + batch_row_boundaries.data()] __device__(auto batch_index) -> size_type { + return util::div_rounding_up_unsafe(batch_row_boundaries[batch_index + 1] - + batch_row_boundaries[batch_index], + desired_window_height); + }); size_type const total_blocks = thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); @@ -1316,7 +1335,7 @@ build_blocks(device_span blocks, thrust::transform( rmm::exec_policy(stream), iter, iter + total_blocks, blocks.begin(), [=, block_starts = block_starts.data(), - batch_row_offsets = batch_row_offsets.data()] __device__(size_type block_index) { + batch_row_boundaries = batch_row_boundaries.data()] __device__(size_type block_index) { // what batch this block falls in auto const batch_index_iter = thrust::upper_bound(thrust::seq, block_starts, block_starts + num_batches, block_index); @@ -1324,14 +1343,15 @@ build_blocks(device_span blocks, // local index within the block int const local_block_index = block_index - block_starts[batch_index]; // the start row for this batch. - int const batch_row_start = batch_row_offsets[batch_index]; + int const batch_row_start = batch_row_boundaries[batch_index]; // the start row for this block int const block_row_start = batch_row_start + (local_block_index * desired_window_height); // the end row for this block - int const max_row = std::min(total_number_of_rows - 1, - batch_index + 1 > num_batches ? - std::numeric_limits::max() : - static_cast(batch_row_offsets[batch_index + 1]) - 1); + int const max_row = + std::min(total_number_of_rows - 1, + batch_index + 1 > num_batches ? 
+ std::numeric_limits::max() : + static_cast(batch_row_boundaries[batch_index + 1]) - 1); int const block_row_end = std::min( batch_row_start + ((local_block_index + 1) * desired_window_height) - 1, max_row); @@ -1420,20 +1440,6 @@ void determine_windows(std::vector const &column_sizes, } } -struct row_size_functor { - size_type _fixed_width_size_per_row; - size_type _num_columns; - row_size_functor(size_t fixed_width_size_per_row, size_t num_columns) - : _fixed_width_size_per_row(fixed_width_size_per_row), _num_columns(num_columns){}; - - CUDA_DEVICE_CALLABLE - int operator()(int row_index) { - auto const bytes_needed = - _fixed_width_size_per_row + util::div_rounding_up_safe(_num_columns, 8); - return detail::align_offset(bytes_needed, 8); - } -}; - #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } // namespace detail @@ -1502,7 +1508,11 @@ std::vector> convert_to_rows(cudf::table_view cons // total encoded row size. This includes fixed-width data, validity, and variable-width data. auto row_size_iter = cudf::detail::make_counting_transform_iterator( - 0, detail::row_size_functor(fixed_width_size_per_row, num_columns)); + 0, [fixed_width_size_per_row, num_columns] __device__(auto i) { + auto const bytes_needed = + fixed_width_size_per_row + util::div_rounding_up_safe(num_columns, 8); + return detail::align_offset(bytes_needed, 8); + }); // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. @@ -1518,11 +1528,14 @@ std::vector> convert_to_rows(cudf::table_view cons std::vector output_buffers; std::vector output_data; output_data.reserve(batch_info.row_batches.size()); - for (uint i = 0; i < batch_info.row_batches.size(); ++i) { - rmm::device_buffer temp(batch_info.row_batches[i].num_bytes, stream, mr); - output_data.push_back(static_cast(temp.data())); - output_buffers.push_back(std::move(temp)); - } + output_buffers.reserve(batch_info.row_batches.size()); + std::transform(batch_info.row_batches.begin(), batch_info.row_batches.end(), + std::back_inserter(output_buffers), [&](auto const &batch) { + return rmm::device_buffer(batch.num_bytes, stream, mr); + }); + std::transform(output_buffers.begin(), output_buffers.end(), std::back_inserter(output_data), + [](auto &buf) { return static_cast(buf.data()); }); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); int info_count = 0; @@ -1551,11 +1564,6 @@ std::vector> convert_to_rows(cudf::table_view cons dim3 blocks(util::div_rounding_up_unsafe(gpu_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); dim3 threads(256); - detail::copy_to_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, gpu_block_infos, dev_input_data.data(), - dev_col_sizes.data(), dev_col_starts.data(), batch_info.input_data_row_offsets.data(), - reinterpret_cast(dev_output_data.data())); - auto validity_block_infos = detail::build_validity_block_infos( num_columns, num_rows, shmem_limit_per_block, batch_info.row_batches); @@ -1563,8 +1571,16 @@ std::vector> convert_to_rows(cudf::table_view cons dim3 validity_blocks( util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_VALIDITY_BLOCKS_PER_KERNEL)); dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); + + detail::copy_to_rows<<>>( + num_rows, num_columns, shmem_limit_per_block, gpu_block_infos, dev_input_data.data(), + dev_col_sizes.data(), dev_col_starts.data(), + batch_info.batch_row_offsets + .data(), // needs to be row offsets per 
batch, not overall JUST for output. + reinterpret_cast(dev_output_data.data())); + detail::copy_validity_to_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, batch_info.input_data_row_offsets.data(), + num_rows, num_columns, shmem_limit_per_block, batch_info.batch_row_offsets.data(), dev_output_data.data(), column_starts.back(), dev_validity_block_infos, dev_input_nm.data()); // split up the output buffer into multiple buffers based on row batch sizes @@ -1693,11 +1709,6 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); - // build the row_batches from the passed in list column - std::vector row_batches; - row_batches.push_back( - {detail::row_batch{child.size(), num_rows, device_uvector(0, stream)}}); - // Allocate the columns we are going to write into std::vector> output_columns; std::vector output_data; @@ -1711,6 +1722,11 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in output_columns.emplace_back(std::move(column)); } + // build the row_batches from the passed in list column + std::vector row_batches; + row_batches.push_back( + {detail::row_batch{child.size(), num_rows, device_uvector(0, stream)}}); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); @@ -1746,10 +1762,6 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in dim3 blocks( util::div_rounding_up_unsafe(gpu_block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); dim3 threads(std::min(std::min(256, shmem_limit_per_block / 8), (int)child.size())); - detail::copy_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), - dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), gpu_block_infos, - child.data()); auto validity_block_infos = detail::build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); @@ -1760,6 +1772,12 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_VALIDITY_BLOCKS_PER_KERNEL)); dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); + + detail::copy_from_rows<<>>( + num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), + dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), gpu_block_infos, + child.data()); + detail:: copy_validity_from_rows<<>>( num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), From 64c8374aa4e21cd164a5011be3cc20d7ec377ac1 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 9 Nov 2021 03:50:24 +0000 Subject: [PATCH 32/80] fixing includes for java --- java/src/main/native/src/row_conversion.cu | 26 ++++++++++++---------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index c5bbed5274c..f9cb61f4ea1 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -23,16 +23,24 @@ #include #include -#include -#include #include -#include "thrust/scan.h" - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 #include #endif +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include #include #include #include @@ -40,20 +48,14 @@ #include #include #include -#include #include 
#include #include #include #include #include -#include -#include -#include -#include -#include -#include -#include + +#include "row_conversion.hpp" #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 2; From f8ea2b1f767f1ce2885b71086c8936a1b13319a5 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Mon, 15 Nov 2021 13:33:35 -0800 Subject: [PATCH 33/80] addressed review concerns --- java/src/main/java/ai/rapids/cudf/Table.java | 26 ++++++---- .../test/java/ai/rapids/cudf/TableTest.java | 48 ++++++++----------- 2 files changed, 37 insertions(+), 37 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index eb61ec25d9a..7d9e5a19ed6 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -2651,6 +2651,23 @@ public GatherMap conditionalLeftAntiJoinGatherMap(Table rightTable, return buildSemiJoinGatherMap(gatherMapData); } + /** + * For details about how this method functions refer to + * {@link #convertToRowsFixedWidthOptimized()}. + * + * The only thing different between this method and {@link #convertToRowsFixedWidthOptimized()} + * is that this can handle rougly 250M columns while {@link #convertToRowsFixedWidthOptimized()} + * can only handle columns less than 100 + */ + public ColumnVector[] convertToRows() { + long[] ptrs = convertToRows(nativeHandle); + ColumnVector[] ret = new ColumnVector[ptrs.length]; + for (int i = 0; i < ptrs.length; i++) { + ret[i] = new ColumnVector(ptrs[i]); + } + return ret; + } + /** * Convert this table of columns into a row major format that is useful for interacting with other * systems that do row major processing of the data. Currently only fixed-width column types are @@ -2725,15 +2742,6 @@ public GatherMap conditionalLeftAntiJoinGatherMap(Table rightTable, * There are some limits on the size of a single row. If the row is larger than 1KB this will * throw an exception. 
*/ - public ColumnVector[] convertToRows() { - long[] ptrs = convertToRows(nativeHandle); - ColumnVector[] ret = new ColumnVector[ptrs.length]; - for (int i = 0; i < ptrs.length; i++) { - ret[i] = new ColumnVector(ptrs[i]); - } - return ret; - } - public ColumnVector[] convertToRowsFixedWidthOptimized() { long[] ptrs = convertToRowsFixedWidthOptimized(nativeHandle); ColumnVector[] ret = new ColumnVector[ptrs.length]; diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 623b444676f..6cc108030d1 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -7223,26 +7223,22 @@ void fixedWidthRowsRoundTripWide() { IntStream.range(0, 10).forEach(i -> tb.decimal32Column(-3, RoundingMode.UNNECESSARY, 5.0d, 9.5d, 0.9d, 7.23d, 2.8d, null)); IntStream.range(0, 10).forEach(i -> tb.decimal64Column(-8, 3L, 9L, 4L, 2L, 20L, null)); - try (Table t = tb.build()) { - ColumnVector[] rows = t.convertToRows(); + try (Table origTable = tb.build()) { + ColumnVector[] rowMajorTable = origTable.convertToRows(); try { // We didn't overflow - assert rows.length == 1; - ColumnVector cv = rows[0]; - assert cv.getRowCount() == t.getRowCount(); -// try (HostColumnVector hcv = cv.copyToHost()) { -// hcv.getChildColumnView(0).getDataBuffer().printBuffer(8); -// } - - DType[] types = new DType[t.getNumberOfColumns()]; - for (int i = 0; i < t.getNumberOfColumns(); i++) { - types[i] = t.getColumn(i).getType(); + assert rowMajorTable.length == 1; + ColumnVector cv = rowMajorTable[0]; + assert cv.getRowCount() == origTable.getRowCount(); + DType[] types = new DType[origTable.getNumberOfColumns()]; + for (int i = 0; i < origTable.getNumberOfColumns(); i++) { + types[i] = origTable.getColumn(i).getType(); } try (Table backAgain = Table.convertFromRows(cv, types)) { - assertTablesAreEqual(t, backAgain); + assertTablesAreEqual(origTable, backAgain); } } finally { - for (ColumnVector cv : rows) { + for (ColumnVector cv : rowMajorTable) { cv.close(); } } @@ -7251,7 +7247,7 @@ void fixedWidthRowsRoundTripWide() { @Test void fixedWidthRowsRoundTrip() { - try (Table t = new TestBuilder() + try (Table origTable = new TestBuilder() .column(3l, 9l, 4l, 2l, 20l, null) .column(5.0d, 9.5d, 0.9d, 7.23d, 2.8d, null) .column(5, 1, 0, 2, 7, null) @@ -7261,25 +7257,21 @@ void fixedWidthRowsRoundTrip() { .decimal32Column(-3, RoundingMode.UNNECESSARY, 5.0d, 9.5d, 0.9d, 7.23d, 2.8d, null) .decimal64Column(-8, 3L, 9L, 4L, 2L, 20L, null) .build()) { - ColumnVector[] rows = t.convertToRowsFixedWidthOptimized(); + ColumnVector[] rowMajorTable = origTable.convertToRowsFixedWidthOptimized(); try { // We didn't overflow - assert rows.length == 1; - ColumnVector cv = rows[0]; - assert cv.getRowCount() == t.getRowCount(); -// try (HostColumnVector hcv = cv.copyToHost()) { -// hcv.getChildColumnView(0).getDataBuffer().printBuffer(8); -// } - - DType[] types = new DType[t.getNumberOfColumns()]; - for (int i = 0; i < t.getNumberOfColumns(); i++) { - types[i] = t.getColumn(i).getType(); + assert rowMajorTable.length == 1; + ColumnVector cv = rowMajorTable[0]; + assert cv.getRowCount() == origTable.getRowCount(); + DType[] types = new DType[origTable.getNumberOfColumns()]; + for (int i = 0; i < origTable.getNumberOfColumns(); i++) { + types[i] = origTable.getColumn(i).getType(); } try (Table backAgain = Table.convertFromRowsFixedWidthOptimized(cv, types)) { - assertTablesAreEqual(t, backAgain); + assertTablesAreEqual(origTable, backAgain); 
} } finally { - for (ColumnVector cv : rows) { + for (ColumnVector cv : rowMajorTable) { cv.close(); } } From c88472a95869619664c28f02dc321428523d87f9 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Mon, 15 Nov 2021 14:33:09 -0800 Subject: [PATCH 34/80] removed TODOs and added note to javadocs --- java/src/main/java/ai/rapids/cudf/Table.java | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 7d9e5a19ed6..b39632e43e7 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -2754,13 +2754,14 @@ public ColumnVector[] convertToRowsFixedWidthOptimized() { /** * Convert a column of list of bytes that is formatted like the output from `convertToRows` * and convert it back to a table. + * + * NOTE: This method doesn't support nested types + * * @param vec the row data to process. * @param schema the types of each column. * @return the parsed table. */ public static Table convertFromRows(ColumnView vec, DType ... schema) { - // TODO at some point we need a schema that support nesting so we can support nested types - // TODO we will need scale at some point very soon too int[] types = new int[schema.length]; int[] scale = new int[schema.length]; for (int i = 0; i < schema.length; i++) { @@ -2774,13 +2775,14 @@ public static Table convertFromRows(ColumnView vec, DType ... schema) { /** * Convert a column of list of bytes that is formatted like the output from `convertToRows` * and convert it back to a table. + * + * NOTE: This method doesn't support nested types + * * @param vec the row data to process. * @param schema the types of each column. * @return the parsed table. */ public static Table convertFromRowsFixedWidthOptimized(ColumnView vec, DType ... schema) { - // TODO at some point we need a schema that support nesting so we can support nested types - // TODO we will need scale at some point very soon too int[] types = new int[schema.length]; int[] scale = new int[schema.length]; for (int i = 0; i < schema.length; i++) { From 00e58d7912e56b790b6e448b827b1cd481ab6500 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Mon, 7 Jun 2021 08:14:52 +0000 Subject: [PATCH 35/80] working on row and column conversions --- .../row_conversion/row_conversion.cpp | 116 ++ cpp/include/cudf/row_conversion.hpp | 51 + cpp/src/row_conversion/row_conversion.cu | 1106 +++++++++++++++++ 3 files changed, 1273 insertions(+) create mode 100644 cpp/benchmarks/row_conversion/row_conversion.cpp create mode 100644 cpp/include/cudf/row_conversion.hpp create mode 100644 cpp/src/row_conversion/row_conversion.cu diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp new file mode 100644 index 00000000000..c4edee91b3c --- /dev/null +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include "cudf_test/column_utilities.hpp" + +class RowConversion : public cudf::benchmark { +}; + +static void BM_to_row(benchmark::State& state) +{ + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + auto const table = create_random_table({cudf::type_id::INT8, + cudf::type_id::INT32, + cudf::type_id::INT16, + cudf::type_id::INT64, + cudf::type_id::INT32, + cudf::type_id::BOOL8, + cudf::type_id::UINT16, + cudf::type_id::UINT8, + cudf::type_id::UINT64}, + 50, + row_count{n_rows}); + + cudf::size_type total_bytes = 0; + for (int i = 0; i < table->num_columns(); ++i) { + auto t = table->get_column(i).type(); + total_bytes += cudf::size_of(t); + } + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + +// auto rows = cudf::convert_to_rows(table->view()); + auto new_rows = cudf::convert_to_rows2(table->view()); + } + + state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); +} + +static void BM_from_row(benchmark::State& state) +{ + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + auto const table = create_random_table({cudf::type_id::INT8, + cudf::type_id::INT32, + cudf::type_id::INT16, + cudf::type_id::INT64, + cudf::type_id::INT32, + cudf::type_id::BOOL8, + cudf::type_id::UINT16, + cudf::type_id::UINT8, + cudf::type_id::UINT64}, + 256, + row_count{n_rows}); + /* auto const table = create_random_table({cudf::type_id::INT32}, + 4, + row_count{n_rows});*/ + + std::vector schema; + cudf::size_type total_bytes = 0; + for (int i = 0; i < table->num_columns(); ++i) { + auto t = table->get_column(i).type(); + schema.push_back(t); + total_bytes += cudf::size_of(t); + } + + auto rows = cudf::convert_to_rows(table->view()); + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + + auto out = cudf::convert_from_rows(rows, schema); + } + + state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); +} + +#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { BM_to_row(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 16, 1 << 24}}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +TO_ROW_CONVERSION_BENCHMARK_DEFINE(to_row_conversion) + +#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { BM_from_row(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 22}}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) diff --git a/cpp/include/cudf/row_conversion.hpp b/cpp/include/cudf/row_conversion.hpp new file mode 100644 index 00000000000..f5e2225ad19 --- /dev/null +++ b/cpp/include/cudf/row_conversion.hpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include + +namespace cudf { + +std::vector> convert_to_rows( + cudf::table_view const &tbl, + // TODO need something for validity + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + +std::vector> convert_to_rows2( + cudf::table_view const &tbl, + // TODO need something for validity + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr convert_from_rows( + cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr convert_from_rows( + std::vector> const &input, + std::vector const &schema, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + +} // namespace cudf diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu new file mode 100644 index 00000000000..fb5dc4cb38d --- /dev/null +++ b/cpp/src/row_conversion/row_conversion.cu @@ -0,0 +1,1106 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "cudf/types.hpp" +#include "rmm/device_buffer.hpp" +#include "thrust/iterator/counting_iterator.h" +#include "thrust/iterator/transform_iterator.h" + +namespace cudf { + +namespace detail { + +static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size_t alignment) +{ + return (offset + alignment - 1) & ~(alignment - 1); +} + + +/** + * Copy a simple vector to device memory asynchronously. Be sure to read + * the data on the same stream as is used to copy it. 
+ */ +template +std::unique_ptr> copy_to_dev_async(const std::vector &input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + std::unique_ptr> ret(new rmm::device_uvector(input.size(), stream, mr)); + CUDA_TRY(cudaMemcpyAsync( + ret->data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); + return ret; +} + +template +rmm::device_uvector copy_to_dev_async2( + const std::vector &input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + rmm::device_uvector ret(input.size(), stream, mr); + CUDA_TRY(cudaMemcpyAsync( + ret.data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); + return ret; +} + +__global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, + const cudf::size_type num_columns, + const cudf::size_type row_size, + const cudf::size_type *input_offset_in_row, + const cudf::size_type *num_bytes, + int8_t **output_data, + cudf::bitmask_type **output_nm, + const int8_t *input_data) +{ + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // For simplicity we will refer to this as a row_group + + // In practice we have found writing more than 4 columns of data per thread + // results in performance loss. As such we are using a 2 dimensional + // kernel in terms of threads, but not in terms of blocks. Columns are + // controlled by the y dimension (there is no y dimension in blocks). Rows + // are controlled by the x dimension (there are multiple blocks in the x + // dimension). + + cudf::size_type rows_per_group = blockDim.x; + cudf::size_type row_group_start = blockIdx.x; + cudf::size_type row_group_stride = gridDim.x; + cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + + extern __shared__ int8_t shared_data[]; + + // Because we are copying fixed width only data and we stride the rows + // this thread will always start copying from shared data in the same place + int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t *row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + + for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; + row_group_index += row_group_stride) { + // Step 1: Copy the data into shared memory + // We know row_size is always aligned with and a multiple of int64_t; + int64_t *long_shared = reinterpret_cast(shared_data); + const int64_t *long_input = reinterpret_cast(input_data); + + cudf::size_type shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); + cudf::size_type shared_output_stride = blockDim.x * blockDim.y; + cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); + if (row_index_end > num_rows) { row_index_end = num_rows; } + cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); + cudf::size_type shared_length = row_size * num_rows_in_group; + + cudf::size_type shared_output_end = shared_length / sizeof(int64_t); + + cudf::size_type start_input_index = + (row_size * row_group_index * rows_per_group) / sizeof(int64_t); + + for (cudf::size_type shared_index = shared_output_index; shared_index < shared_output_end; + shared_index += shared_output_stride) { + long_shared[shared_index] = long_input[start_input_index + 
shared_index]; + } + // Wait for all of the data to be in shared memory + __syncthreads(); + + // Step 2 copy the data back out + + // Within the row group there should be 1 thread for each row. This is a + // requirement for launching the kernel + cudf::size_type row_index = (row_group_index * rows_per_group) + threadIdx.x; + // But we might not use all of the threads if the number of rows does not go + // evenly into the thread count. We don't want those threads to exit yet + // because we may need them to copy data in for the next row group. + uint32_t active_mask = __ballot_sync(0xffffffff, row_index < num_rows); + if (row_index < num_rows) { + cudf::size_type col_index_start = threadIdx.y; + cudf::size_type col_index_stride = blockDim.y; + for (cudf::size_type col_index = col_index_start; col_index < num_columns; + col_index += col_index_stride) { + cudf::size_type col_size = num_bytes[col_index]; + const int8_t *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); + int8_t *col_output = output_data[col_index]; + switch (col_size) { + case 1: { + col_output[row_index] = *col_tmp; + break; + } + case 2: { + int16_t *short_col_output = reinterpret_cast(col_output); + short_col_output[row_index] = *reinterpret_cast(col_tmp); + break; + } + case 4: { + int32_t *int_col_output = reinterpret_cast(col_output); + int_col_output[row_index] = *reinterpret_cast(col_tmp); + break; + } + case 8: { + int64_t *long_col_output = reinterpret_cast(col_output); + long_col_output[row_index] = *reinterpret_cast(col_tmp); + break; + } + default: { + cudf::size_type output_offset = col_size * row_index; + // TODO this should just not be supported for fixed width columns, but just in case... + for (cudf::size_type b = 0; b < col_size; b++) { + col_output[b + output_offset] = col_tmp[b]; + } + break; + } + } + + cudf::bitmask_type *nm = output_nm[col_index]; + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; + int predicate = *valid_byte & (1 << byte_bit_offset); + uint32_t bitmask = __ballot_sync(active_mask, predicate); + if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } + } // end column loop + } // end row copy + // wait for the row_group to be totally copied before starting on the next row group + __syncthreads(); + } +} + +__global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, + const cudf::size_type num_rows, + const cudf::size_type num_columns, + const cudf::size_type row_size, + const cudf::size_type *output_offset_in_row, + const cudf::size_type *num_bytes, + const int8_t **input_data, + const cudf::bitmask_type **input_nm, + int8_t *output_data) +{ + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // We do not support copying a subset of the columns in a row yet, so we don't + // currently support a row that is wider than shared memory. + // For simplicity we will refer to this as a row_group + + // In practice we have found reading more than 4 columns of data per thread + // results in performance loss. As such we are using a 2 dimensional + // kernel in terms of threads, but not in terms of blocks. Columns are + // controlled by the y dimension (there is no y dimension in blocks). Rows + // are controlled by the x dimension (there are multiple blocks in the x + // dimension). 
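As a side note, a small host-side sketch of the thread mapping the comment above describes (hypothetical toy launch, illustrative only): threadIdx.x selects the row within a row group, threadIdx.y strides across the columns, and blockIdx.x strides across the row groups.

#include <cstdio>

int main() {
  // hypothetical toy launch: 2 blocks of 4x2 threads over 10 rows and 5 columns
  int const num_rows = 10, num_columns = 5;
  int const block_dim_x = 4, block_dim_y = 2, grid_dim_x = 2;
  int const rows_per_group = block_dim_x;
  int const num_row_groups = (num_rows + rows_per_group - 1) / rows_per_group;

  for (int bx = 0; bx < grid_dim_x; ++bx)                          // blockIdx.x
    for (int ty = 0; ty < block_dim_y; ++ty)                       // threadIdx.y
      for (int tx = 0; tx < block_dim_x; ++tx)                     // threadIdx.x
        for (int g = bx; g < num_row_groups; g += grid_dim_x) {    // blocks stride across row groups
          int const row = g * rows_per_group + tx;                 // x dimension -> row
          if (row >= num_rows) continue;
          for (int col = ty; col < num_columns; col += block_dim_y)  // y dimension -> columns
            std::printf("block %d thread (%d,%d) handles row %d col %d\n", bx, tx, ty, row, col);
        }
  return 0;
}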
+ + cudf::size_type rows_per_group = blockDim.x; + cudf::size_type row_group_start = blockIdx.x; + cudf::size_type row_group_stride = gridDim.x; + cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + + extern __shared__ int8_t shared_data[]; + + // Because we are copying fixed width only data and we stride the rows + // this thread will always start copying to shared data in the same place + int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t *row_vld_tmp = + &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + + for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; + row_group_index += row_group_stride) { + // Within the row group there should be 1 thread for each row. This is a + // requirement for launching the kernel + cudf::size_type row_index = start_row + (row_group_index * rows_per_group) + threadIdx.x; + // But we might not use all of the threads if the number of rows does not go + // evenly into the thread count. We don't want those threads to exit yet + // because we may need them to copy data back out. + if (row_index < (start_row + num_rows)) { + cudf::size_type col_index_start = threadIdx.y; + cudf::size_type col_index_stride = blockDim.y; + for (cudf::size_type col_index = col_index_start; col_index < num_columns; + col_index += col_index_stride) { + cudf::size_type col_size = num_bytes[col_index]; + int8_t *col_tmp = &(row_tmp[output_offset_in_row[col_index]]); + const int8_t *col_input = input_data[col_index]; + switch (col_size) { + case 1: { + *col_tmp = col_input[row_index]; + break; + } + case 2: { + const int16_t *short_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = short_col_input[row_index]; + break; + } + case 4: { + const int32_t *int_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = int_col_input[row_index]; + break; + } + case 8: { + const int64_t *long_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = long_col_input[row_index]; + break; + } + default: { + cudf::size_type input_offset = col_size * row_index; + // TODO this should just not be supported for fixed width columns, but just in case... 
+ for (cudf::size_type b = 0; b < col_size; b++) { + col_tmp[b] = col_input[b + input_offset]; + } + break; + } + } + // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned + // so we have to rewrite the addresses to make sure that it is 4 byte aligned + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; + uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; + int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); + cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); + // Now copy validity for the column + if (input_nm[col_index]) { + if (bit_is_set(input_nm[col_index], row_index)) { + atomicOr_block(valid_int, 1 << int_bit_offset); + } else { + atomicAnd_block(valid_int, ~(1 << int_bit_offset)); + } + } else { + // It is valid so just set the bit + atomicOr_block(valid_int, 1 << int_bit_offset); + } + } // end column loop + } // end row copy + // wait for the row_group to be totally copied into shared memory + __syncthreads(); + + // Step 2: Copy the data back out + // We know row_size is always aligned with and a multiple of int64_t; + int64_t *long_shared = reinterpret_cast(shared_data); + int64_t *long_output = reinterpret_cast(output_data); + + cudf::size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); + cudf::size_type shared_input_stride = blockDim.x * blockDim.y; + cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); + if (row_index_end > num_rows) { row_index_end = num_rows; } + cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); + cudf::size_type shared_length = row_size * num_rows_in_group; + + cudf::size_type shared_input_end = shared_length / sizeof(int64_t); + + cudf::size_type start_output_index = + (row_size * row_group_index * rows_per_group) / sizeof(int64_t); + + for (cudf::size_type shared_index = shared_input_index; shared_index < shared_input_end; + shared_index += shared_input_stride) { + long_output[start_output_index + shared_index] = long_shared[shared_index]; + } + __syncthreads(); + // Go for the next round + } +} + +struct block_info { + int start_col; + int start_row; + int end_col; + int end_row; + int buffer_num; +}; + +/** + * @brief copy data from cudf columns into x format, which is row-based + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param input_data pointer to raw table data + * @param input_nm pointer to validity data + * @param col_sizes array of sizes for each element in a column - one per column + * @param col_offsets offset into input data row for each column's start + * @param block_infos information about the blocks of work + * @param row_offsets offset to a specific row in the input data + * @param output_data pointer to output data + * + */ +__global__ void copy_from_columns(const cudf::size_type num_rows, + const cudf::size_type num_columns, + const int8_t **input_data, + const cudf::bitmask_type **input_nm, + const cudf::size_type *col_sizes, + const cudf::size_type *col_offsets, + const block_info *block_infos, + const uint64_t *row_offsets, + int8_t **output_data) +{ + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. 
+ // This has been broken up for us in the block_info struct, so we don't have + // any calculation to do here, but it is important to note. + + auto block = block_infos[blockIdx.x]; + extern __shared__ int8_t shared_data[]; + uint64_t const output_start_offset = col_offsets[block.start_col] + row_offsets[block.start_row]; + uint8_t const dest_shim_offset = reinterpret_cast(&output_data[0][output_start_offset]) & 7; // offset for alignment shim in order to match shared memory with final dest + + printf("copying from column %d to column %d with rows %d to row %d(grid dim %d, blockIdx %d)\n", block.start_col, block.end_col, block.start_row, block.end_row, gridDim.x, blockIdx.x); + + // each thread is responsible for every threadcount rows of data. + // the data is copies into shared memory in the final layout. + auto const shmem_row_size = align_offset(col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col] + dest_shim_offset, 8); // 8 byte alignment required for shared memory rows + auto const validity_offset = col_offsets[num_columns]; + for (int col=block.start_col; col<=block.end_col; ++col) { + /*if (!col_is_variable) */{ + uint64_t col_offset = 0; + cudf::size_type col_size = col_sizes[col]; + auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; + for (int row=block.start_row + threadIdx.x; row(input_data[col]); + *reinterpret_cast(shmem_dest) = short_col_input[row]; + break; + } + case 4: { + const int32_t *int_col_input = reinterpret_cast(input_data[col]); + *reinterpret_cast(shmem_dest) = int_col_input[row]; + break; + } + case 8: { + const int64_t *long_col_input = reinterpret_cast(input_data[col]); + *reinterpret_cast(shmem_dest) = long_col_input[row]; + break; + } + default: { + cudf::size_type input_offset = col_size * row; + // TODO this should just not be supported for fixed width columns, but just in case... + for (cudf::size_type b = 0; b < col_size; b++) { + shmem_dest[b] = input_data[col][b + input_offset]; + } + break; + } + } + + // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned + // so we have to rewrite the addresses to make sure that it is 4 byte aligned + // we do this directly in the final location because the entire row may not + // fit in shared memory and may require many blocks to process it entirely + int8_t *valid_byte = &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; + cudf::size_type byte_bit_offset = col % 8; + uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; + int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); + cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); + // Now copy validity for the column + if (input_nm[col]) { + if (bit_is_set(input_nm[col], row)) { + atomicOr_block(valid_int, 1 << int_bit_offset); + } else { + atomicAnd_block(valid_int, ~(1 << int_bit_offset)); + } + } else { + // It is valid so just set the bit + atomicOr_block(valid_int, 1 << int_bit_offset); + } + } // end row + + col_offset += col_sizes[col] * (block.end_row - block.start_row); + } + } // end col + + // wait for the data to be totally copied into shared memory + __syncthreads(); + + // Step 2: Copy the data from shared memory to final destination + // each block is potentially a slice of the table, so no assumptions + // can be made about alignments. We do know that the alignment in shared + // memory matches the final destination alignment. 
Also note that + // we are not writing to entirely contiguous destinations as each + // row in shared memory may not be an entire row of the destination. + // + auto const thread_start_offset = threadIdx.x * 8; + auto const thread_stride = gridDim.x * 8; + for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * (block.end_row - block.start_row); src_offset += thread_stride) { + auto const output_row_num = src_offset / shmem_row_size; + auto const row_offset = row_offsets[block.start_row + output_row_num]; + auto const col_offset = src_offset % shmem_row_size; + int8_t *output_ptr = &output_data[block.buffer_num][row_offset + col_offset]; + int8_t *input_ptr = &shared_data[src_offset]; + // the first part and last part of the row is unaligned data copy. This is copied a single byte + // at a time. + if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { + // first part of a row, copy single bytes + auto const num_single_bytes = 8 - dest_shim_offset; + for (auto i=0; i 0 && (src_offset + 8) % shmem_row_size == 0) { + // last part of a row, copy single bytes + auto const num_single_bytes = dest_shim_offset; + for (auto i=0; i(input_ptr); + *reinterpret_cast(output_ptr) = *long_col_input; + } + } +} + +/** + * Calculate the dimensions of the kernel for fixed width only columns. + * @param [in] num_columns the number of columns being copied. + * @param [in] num_rows the number of rows being copied. + * @param [in] size_per_row the size each row takes up when padded. + * @param [out] blocks the size of the blocks for the kernel + * @param [out] threads the size of the threads for the kernel + * @return the size in bytes of shared memory needed for each block. + */ +static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, + const cudf::size_type num_rows, + const cudf::size_type size_per_row, + dim3 &blocks, + dim3 &threads) +{ + // We have found speed degrades when a thread handles more than 4 columns. + // Each block is 2 dimensional. The y dimension indicates the columns. + // We limit this to 32 threads in the y dimension so we can still + // have at least 32 threads in the x dimension (1 warp) which should + // result in better coalescing of memory operations. We also + // want to guarantee that we are processing a multiple of 32 threads + // in the x dimension because we use atomic operations at the block + // level when writing validity data out to main memory, and that would + // need to change if we split a word of validity data between blocks. + int y_block_size = (num_columns + 3) / 4; + if (y_block_size > 32) { y_block_size = 32; } + int x_possible_block_size = 1024 / y_block_size; + // 48KB is the default setting for shared memory per block according to the cuda tutorials + // If someone configures the GPU to only have 16 KB this might not work. + int max_shared_size = 48 * 1024; + int max_block_size = max_shared_size / size_per_row; + // If we don't have enough shared memory there is no point in having more threads + // per block that will just sit idle + max_block_size = max_block_size > x_possible_block_size ? x_possible_block_size : max_block_size; + // Make sure that the x dimension is a multiple of 32 this not only helps + // coalesce memory access it also lets us do a ballot sync for validity to write + // the data back out the warp level. If x is a multiple of 32 then each thread in the y + // dimension is associated with one or more warps, that should correspond to the validity + // words directly. 
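To make the sizing above concrete, a host-side recomputation of these kernel dimensions for a hypothetical table (illustrative only, not part of the patch):

#include <algorithm>
#include <cstdio>

int main() {
  int const num_columns  = 10;        // hypothetical
  int const num_rows     = 1'000'000; // hypothetical
  int const size_per_row = 48;        // hypothetical padded row size in bytes

  int const y_block_size = std::min((num_columns + 3) / 4, 32);  // <= 4 columns per thread, capped at 32
  int const x_possible   = 1024 / y_block_size;                  // stay within 1024 threads per block
  int const max_block    = (48 * 1024) / size_per_row;           // rows whose staging fits in 48KB of shared memory
  int const block_size   = (std::min(max_block, x_possible) / 32) * 32;  // multiple of 32 for the validity ballot
  int const num_blocks   = std::min(std::max((num_rows + block_size - 1) / block_size, 1), 10240);

  // prints: threads = (320, 3), blocks = 3125, shared memory = 15360 bytes per block
  std::printf("threads = (%d, %d), blocks = %d, shared memory = %d bytes per block\n",
              block_size, y_block_size, num_blocks, size_per_row * block_size);
  return 0;
}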
+ int block_size = (max_block_size / 32) * 32; + CUDF_EXPECTS(block_size != 0, "Row size is too large to fit in shared memory"); + + int num_blocks = (num_rows + block_size - 1) / block_size; + if (num_blocks < 1) { + num_blocks = 1; + } else if (num_blocks > 10240) { + // The maximum number of blocks supported in the x dimension is 2 ^ 31 - 1 + // but in practice haveing too many can cause some overhead that I don't totally + // understand. Playing around with this haveing as little as 600 blocks appears + // to be able to saturate memory on V100, so this is an order of magnitude higher + // to try and future proof this a bit. + num_blocks = 10240; + } + blocks.x = num_blocks; + blocks.y = 1; + blocks.z = 1; + threads.x = block_size; + threads.y = y_block_size; + threads.z = 1; + return size_per_row * block_size; +} + +/** + * When converting to rows it is possible that the size of the table was too big to fit + * in a single column. This creates an output column for a subset of the rows in a table + * going from start row and containing the next num_rows. Most of the parameters passed + * into this function are common between runs and should be calculated once. + */ +static std::unique_ptr fixed_width_convert_to_rows( + const cudf::size_type start_row, + const cudf::size_type num_rows, + const cudf::size_type num_columns, + const cudf::size_type size_per_row, + std::unique_ptr> &column_start, + std::unique_ptr> &column_size, + std::unique_ptr> &input_data, + std::unique_ptr> &input_nm, + const cudf::scalar &zero, + const cudf::scalar &scalar_size_per_row, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + int64_t total_allocation = size_per_row * num_rows; + // We made a mistake in the split somehow + CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); + + // Allocate and set the offsets row for the byte array + std::unique_ptr offsets = + cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream); + + std::unique_ptr data = + cudf::make_numeric_column(cudf::data_type(cudf::type_id::INT8), + static_cast(total_allocation), + cudf::mask_state::UNALLOCATED, + stream, + mr); + + dim3 blocks; + dim3 threads; + int shared_size = + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + + copy_from_fixed_width_columns<<>>( + start_row, + num_rows, + num_columns, + size_per_row, + column_start->data(), + column_size->data(), + input_data->data(), + input_nm->data(), + data->mutable_view().data()); + + return cudf::make_lists_column(num_rows, + std::move(offsets), + std::move(data), + 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, + stream, + mr); +} + +static cudf::data_type get_data_type(const cudf::column_view &v) { return v.type(); } + +static inline bool are_all_fixed_width(std::vector const &schema) +{ + return std::all_of( + schema.begin(), schema.end(), [](const cudf::data_type &t) { return cudf::is_fixed_width(t); }); +} + +/** + * Given a set of fixed width columns, calculate how the data will be laid out in memory. + * @param [in] schema the types of columns that need to be laid out. + * @param [out] column_start the byte offset where each column starts in the row. + * @param [out] column_size the size in bytes of the data for each columns in the row. + * @return the size in bytes each row needs. 
+ */ +static inline int32_t compute_fixed_width_layout(std::vector const &schema, + std::vector &column_start, + std::vector &column_size) +{ + // We guarantee that the start of each column is 64-bit aligned so anything can go + // there, but to make the code simple we will still do an alignment for it. + int32_t at_offset = 0; + for (auto col = schema.begin(); col < schema.end(); col++) { + cudf::size_type s = cudf::size_of(*col); + column_size.emplace_back(s); + std::size_t allocation_needed = s; + std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types + at_offset = align_offset(at_offset, alignment_needed); + column_start.emplace_back(at_offset); + at_offset += allocation_needed; + } + + // Now we need to add in space for validity + // Eventually we can think about nullable vs not nullable, but for now we will just always add it + // in + int32_t validity_bytes_needed = (schema.size() + 7) / 8; + // validity comes at the end and is byte aligned so we can pack more in. + at_offset += validity_bytes_needed; + // Now we need to pad the end so all rows are 64 bit aligned + return align_offset(at_offset, 8); // 8 bytes (64 bits) +} + +} // namespace detail + +//#define DEBUG +std::vector> convert_to_rows2(cudf::table_view const &tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the data, but small enough + // that multiple columns fit in memory so the writes can coalese as well. Potential optimization for window sizes. + constexpr int max_window_height = 1024; + const size_type num_columns = tbl.num_columns(); + const size_type num_rows = tbl.num_rows(); + + #if defined(DEBUG) + auto pretty_print = [](uint64_t i) { + if (i > (1 * 1024 * 1024 * 1024)) { + printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); + } else if (i > (1 * 1024 * 1024)) { + printf("%.2f MB", i / float(1 * 1024 * 1024)); + } else if (i > (1 * 1024)) { + printf("%.2f KB", float(i / 1024)); + } else { + printf("%lu Bytes", i); + } + }; + #endif + + int device_id; + CUDA_TRY(cudaGetDevice(&device_id)); + int shmem_limit_per_block; + CUDA_TRY( + cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + + // break up the work into blocks, which are a starting and ending row/col #. + // this window size is calculated based on the shared memory size available + // we want a single block to fill up the entire shared memory space available + // for the transpose-like conversion. + + // There are two different processes going on here. The GPU conversion of the data + // and the writing of the data into the list of byte columns that are a maximum of + // 2 gigs each due to offset maximum size. The GPU conversion portion has to understand + // this limitation because the column must own the data inside and as a result it must be + // a distinct allocation for that column. Copying the data into these final buffers would + // be prohibitively expensive, so care is taken to ensure the GPU writes to the proper buffer. + // The windows are broken at the boundaries of specific rows based on the row sizes up + // to that point. These are row batches and they are decided first before building the + // windows so the windows can be properly cut around them. 
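A minimal sketch of the column-window cut described above, under hypothetical column sizes and a hypothetical shared-memory budget (illustrative only): columns are accumulated, with per-type alignment, until a window of window_height rows would no longer fit in shared memory, at which point the window is closed and a new one starts.

#include <cstdio>
#include <vector>

static int align_offset(int offset, int alignment) { return (offset + alignment - 1) & ~(alignment - 1); }

int main() {
  int const shmem_limit_per_block = 16 * 1024;  // hypothetical budget
  int const window_height         = 1024;       // rows per window
  std::vector<int> const column_sizes{1, 4, 2, 8, 4, 1, 2, 1, 8};  // bytes per element, hypothetical

  int row_size = 0;
  int window_start_col = 0;
  for (int col = 0; col < (int)column_sizes.size(); ++col) {
    int const size_with_col = align_offset(row_size, column_sizes[col]) + column_sizes[col];
    if (size_with_col * window_height > shmem_limit_per_block) {
      std::printf("window: columns [%d, %d]\n", window_start_col, col - 1);
      window_start_col = col;
      row_size = column_sizes[col];  // restart (the patch also re-aligns to the output row here)
    } else {
      row_size = size_with_col;
    }
  }
  std::printf("window: columns [%d, %d]\n", window_start_col, (int)column_sizes.size() - 1);
  return 0;
}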
+ + std::vector row_sizes; // size of each row in bytes including any alignment padding + std::vector row_offsets; // offset from the start of the data to this row + std::vector column_sizes; // byte size of each column + std::vector column_starts; // offset of column inside a row including alignment + std::vector variable_width_columns; // list of the variable width columns in the table + row_sizes.reserve(num_rows); + row_offsets.reserve(num_rows); + column_sizes.reserve(num_columns); + column_starts.reserve(num_columns+1); // we add a final offset for validity data start + + size_type fixed_width_size_per_row = 0; + for (int col = 0; col < num_columns; ++col) { + auto cv = tbl.column(col); + auto col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (nested_type) { variable_width_columns.push_back(cv);} + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + } + + // When building the columns to return, we have to be mindful of the offset limit in cudf. + // It is 32-bit and these data columns are capable of surpassing that easily. The data should + // not be cut off exactly at the limit though due to the validity buffers. The most efficient + // place to cut the validity is on a 32-row boundary, so as we calculate the row sizes + // we keep track of the cut points for the validity, which we call row batches. If the row + // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we hit. + // Note that this boundary is for our book-keeping with column pointers and not anything + // that the kernel needs to worry about. We cut the output at convienient boundaries + // when assembling the outgoing data stream. + struct row_batch { + size_type num_bytes; + size_type row_count; + }; + std::vector row_batches; + + auto calculate_variable_width_row_data_size = [](int const row) { + // each level of variable-width data will add an offset/length + // uint64 of data. The first of which is inside the fixed-width + // data itself and needs to be aligned based on what is around + // that data. This is handled above with the fixed-width calculations + // for that reason. We may still need to add more of these offset/length + // combinations if the nesting is deeper than one level as these + // will be included in the variable-width data blob at the end of the + // row. + return 0; +/* auto c = variable_width_columns[col]; + while (true) { + auto col_offsets = c.child(0).data(); + auto col_data_size = size_of(c.child(1).type()); + std::size_t alignment_needed = col_data_size; + + row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; + if (c.num_children() == 0) { + break; + } + c = c.child(1); + } +*/ + }; + + uint64_t row_batch_size = 0; + uint64_t total_table_size = 0; + size_type row_batch_rows = 0; + uint64_t row_offset = 0; + + // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then calculate + // the size of each row's variable-width data as well. 
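Two quick worked examples of the arithmetic the loop below relies on, the 8-byte row alignment and the snap back to a 32-row boundary when a batch is cut (illustrative only, not part of the patch):

#include <cstdio>

static int align_offset(int offset, int alignment) { return (offset + alignment - 1) & ~(alignment - 1); }

int main() {
  // rows are 8-byte aligned: round each offset up to the next multiple of 8
  std::printf("align_offset(13, 8) = %d\n", align_offset(13, 8));  // 16
  std::printf("align_offset(16, 8) = %d\n", align_offset(16, 8));  // 16, already aligned

  // when the 2GB limit is hit mid-batch, cut at the previous 32-row boundary so a
  // 32-bit validity word is never split across two batches
  int const row_batch_rows = 1000;             // hypothetical rows accumulated so far
  std::printf("cut batch at %d rows, carry %d rows into the next batch\n",
              row_batch_rows & ~31,            // 992
              row_batch_rows & 31);            // 8
  return 0;
}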
+ for (int row = 0; row < num_rows; ++row) { + row_sizes[row] = fixed_width_size_per_row + calculate_variable_width_row_data_size(row); + if (row_batch_size + row_sizes[row] > std::numeric_limits::max()) { + // a new batch starts at the last 32-row boundary + row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batch_size = 0; + row_batch_rows = row_batch_rows & 31; + row_offset = 0; + } + row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned + row_offsets.push_back(row_offset); + row_batch_size += row_sizes[row]; + row_offset += row_sizes[row]; + total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned + total_table_size += row_sizes[row]; + row_batch_rows++; + } + if (row_batch_size > 0) { + row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + } + + #if defined(DEBUG) + printf("%lu batches:\n", row_batches.size()); + for (auto i = 0; i < (int)row_batches.size(); ++i) { + printf("%d: %d rows, ", i, row_batches[i].row_count); + pretty_print(row_batches[i].num_bytes); + printf("\n"); + } + #endif + + std::vector block_infos; + + // block infos are organized with the windows going "down" the columns + // this provides the most coalescing of memory access + int current_window_size = 0; + int current_window_start_col = 0; + + // build the blocks for a specific set of columns + auto build_blocks = [&block_infos, &row_batches, num_rows](int const start_col, int const end_col, int const desired_window_height) { + int current_window_start_row = 0; + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; + while (i < num_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(desired_window_height, rows_left_in_batch); + + block_infos.emplace_back( + detail::block_info{start_col, + current_window_start_row, + start_col + end_col, + std::min(current_window_start_row + window_height - 1, num_rows), current_window_row_batch}); + + i += window_height; + current_window_start_row += window_height; + rows_left_in_batch -= window_height; + } + }; + + int const window_height = std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); + + int row_size = 0; + + // march each column and build the blocks of appropriate sizes + for (int col = 0; col < num_columns; ++col) { + auto const col_size = column_sizes[col]; + + // align size for this type + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_with_this_col = detail::align_offset(row_size, alignment_needed) + col_size; + + if (row_size_with_this_col * window_height > shmem_limit_per_block) { + // too large, close this window, generate vertical blocks and restart + build_blocks(current_window_start_col, col - 1, window_height); + row_size = detail::align_offset(column_starts[col] & 7, alignment_needed) + col_size; // alignment required for shared memory window boundary to match alignment of output row + current_window_start_col = col; + } else { + row_size = row_size_with_this_col; + } + } + + auto validity_offset = detail::align_offset(column_starts.back(), 4); + column_starts.push_back(validity_offset); + + // build last set of blocks + if (current_window_size > 0) { build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); } + + // Get the pointers 
to the input columnar data ready - possibly moved up to copy to gpu while calculating other things + std::vector input_data; + std::vector input_nm; + for (size_type column_number = 0; column_number < num_columns; column_number++) { + column_view cv = tbl.column(column_number); + auto const col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (!nested_type) { + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + } + + #if defined(DEBUG) + printf("%lu windows for %d columns, %d rows to fit in ", block_infos.size(), block_infos[0].end_col - block_infos[0].start_col, block_infos[0].end_row - block_infos[0].start_row); + pretty_print(shmem_limit_per_block); + printf(" shared mem("); + pretty_print(fixed_width_size_per_row); + printf("/row, %d columns, %d rows, ", num_columns, num_rows); + pretty_print(total_table_size); + printf(" total):\n"); + #endif + + auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + + std::vector output_data; + output_data.reserve(row_batches.size()); + for (uint i=0; i>>(num_rows, + num_columns, + dev_input_data.data(), + dev_input_nm.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + dev_block_infos.data(), + dev_row_offsets.data(), + reinterpret_cast(dev_output_data.data())); + + // split up the output buffer into multiple buffers based on row batch sizes + // and create list of byte columns + int offset_offset = 0; + std::vector> ret; + for (uint i=0; i offset_vals; + offset_vals.reserve(row_batches[i].row_count + 1); + size_type cur_offset = 0; + offset_vals.push_back(cur_offset); + for (int row=0; row(data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); + + auto data = + std::make_unique(data_type{cudf::type_id::INT8}, + row_batches[i].num_bytes, + std::move(output_data[i])); + + ret.push_back(cudf::make_lists_column(row_batches[i].row_count, + std::move(offsets), + std::move(data), + 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, + stream, + mr)); + } + + return ret; +} + +std::vector> convert_to_rows(cudf::table_view const &tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + const cudf::size_type num_columns = tbl.num_columns(); + + std::vector schema; + schema.resize(num_columns); + std::transform(tbl.begin(), tbl.end(), schema.begin(), detail::get_data_type); + + if (detail::are_all_fixed_width(schema)) { + std::vector column_start; + std::vector column_size; + + int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); + auto dev_column_start = detail::copy_to_dev_async(column_start, stream, mr); + auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); + + int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; + // Make the number of rows per batch a multiple of 32 so we don't have to worry about + // splitting validity at a specific row offset. This might change in the future. 
+ max_rows_per_batch = (max_rows_per_batch / 32) * 32; + + cudf::size_type num_rows = tbl.num_rows(); + + // Get the pointers to the input columnar data ready + std::vector input_data; + std::vector input_nm; + for (cudf::size_type column_number = 0; column_number < num_columns; column_number++) { + cudf::column_view cv = tbl.column(column_number); + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + auto dev_input_data = detail::copy_to_dev_async(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async(input_nm, stream, mr); + + using ScalarType = cudf::scalar_type_t; + auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); + zero->set_valid(true, stream); + static_cast(zero.get())->set_value(0, stream); + + auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); + step->set_valid(true, stream); + static_cast(step.get()) + ->set_value(static_cast(size_per_row), stream); + + std::vector> ret; + for (cudf::size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { + cudf::size_type row_count = num_rows - row_start; + row_count = row_count > max_rows_per_batch ? max_rows_per_batch : row_count; + ret.emplace_back(detail::fixed_width_convert_to_rows(row_start, + row_count, + num_columns, + size_per_row, + dev_column_start, + dev_column_size, + dev_input_data, + dev_input_nm, + *zero, + *step, + stream, + mr)); + } + + return ret; + } else { + CUDF_FAIL("Only fixed width types are currently supported"); + } +} + +std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + // verify that the types are what we expect + cudf::column_view child = input.child(); + cudf::type_id list_type = child.type().id(); + CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, + "Only a list of bytes is supported as input"); + + cudf::size_type num_columns = schema.size(); + + if (detail::are_all_fixed_width(schema)) { + std::vector column_start; + std::vector column_size; + + cudf::size_type num_rows = input.parent().size(); + int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); + + // Ideally we would check that the offsets are all the same, etc. 
but for now + // this is probably fine + CUDF_EXPECTS(size_per_row * num_rows == child.size(), + "The layout of the data appears to be off"); + auto dev_column_start = detail::copy_to_dev_async(column_start, stream, mr); + auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); + + // Allocate the columns we are going to write into + std::vector> output_columns; + std::vector output_data; + std::vector output_nm; + for (cudf::size_type i = 0; i < num_columns; i++) { + auto column = cudf::make_fixed_width_column( + schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); + auto mut = column->mutable_view(); + output_data.emplace_back(mut.data()); + output_nm.emplace_back(mut.null_mask()); + output_columns.emplace_back(std::move(column)); + } + + auto dev_output_data = detail::copy_to_dev_async(output_data, stream, mr); + auto dev_output_nm = detail::copy_to_dev_async(output_nm, stream, mr); + + dim3 blocks; + dim3 threads; + int shared_size = + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + + detail::copy_to_fixed_width_columns<<>>( + num_rows, + num_columns, + size_per_row, + dev_column_start->data(), + dev_column_size->data(), + dev_output_data->data(), + dev_output_nm->data(), + child.data()); + + return std::make_unique(std::move(output_columns)); + } else { + CUDF_FAIL("Only fixed width types are currently supported"); + } +} + +std::unique_ptr convert_from_rows( + std::vector> const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + CUDF_EXPECTS(input.size() == 1, "Too large of an input, need to concat the output tables..."); + + // for (uint i=0; iview(); + auto ret = convert_from_rows(lcv, schema, stream, mr); + + return ret; + // } +} + +} // namespace cudf From b9f42cd2701b8933aae7156a34c9bd3ad83b1f05 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 10 Jun 2021 17:53:09 +0000 Subject: [PATCH 36/80] fixing kernel launch and updating --- .../row_conversion/row_conversion.cpp | 9 +- cpp/src/row_conversion/row_conversion.cu | 105 +++++++++++++----- 2 files changed, 83 insertions(+), 31 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index c4edee91b3c..9fa05c408e5 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -28,7 +28,7 @@ class RowConversion : public cudf::benchmark { static void BM_to_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, +/* auto const table = create_random_table({cudf::type_id::INT8, cudf::type_id::INT32, cudf::type_id::INT16, cudf::type_id::INT64, @@ -38,7 +38,10 @@ static void BM_to_row(benchmark::State& state) cudf::type_id::UINT8, cudf::type_id::UINT64}, 50, - row_count{n_rows}); + row_count{n_rows});*/ + auto const table = create_random_table({cudf::type_id::INT32}, + 64, + row_count{n_rows}); cudf::size_type total_bytes = 0; for (int i = 0; i < table->num_columns(); ++i) { @@ -98,7 +101,7 @@ static void BM_from_row(benchmark::State& state) (::benchmark::State & st) { BM_to_row(st); } \ BENCHMARK_REGISTER_F(RowConversion, name) \ ->RangeMultiplier(8) \ - ->Ranges({{1 << 16, 1 << 24}}) \ + ->Ranges({{1 << 6, 1 << 20}}) \ ->UseManualTime() \ ->Unit(benchmark::kMillisecond); diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu 
index fb5dc4cb38d..994233a0700 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include #include @@ -347,14 +348,14 @@ struct block_info { * @param output_data pointer to output data * */ -__global__ void copy_from_columns(const cudf::size_type num_rows, - const cudf::size_type num_columns, +__global__ void copy_from_columns(const size_type num_rows, + const size_type num_columns, const int8_t **input_data, - const cudf::bitmask_type **input_nm, - const cudf::size_type *col_sizes, - const cudf::size_type *col_offsets, + const bitmask_type **input_nm, + const size_type *col_sizes, + const size_type *col_offsets, const block_info *block_infos, - const uint64_t *row_offsets, + const size_type *row_offsets, int8_t **output_data) { // We are going to copy the data in two passes. @@ -365,47 +366,92 @@ __global__ void copy_from_columns(const cudf::size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. + bool debug_print = false; + + if (debug_print) { + printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); + printf("Column Info:\n"); + for (int i=0; i(&output_data[0][output_start_offset]) & 7; // offset for alignment shim in order to match shared memory with final dest - - printf("copying from column %d to column %d with rows %d to row %d(grid dim %d, blockIdx %d)\n", block.start_col, block.end_col, block.start_row, block.end_row, gridDim.x, blockIdx.x); - + if (debug_print) { + printf("outputting to offset %lu\n", output_start_offset); + printf("dest shim offset is %d\n", dest_shim_offset); + printf("Shared data is %p-%p\n", shared_data, shared_data + (48 * 1024)); + } // each thread is responsible for every threadcount rows of data. // the data is copies into shared memory in the final layout. 
auto const shmem_row_size = align_offset(col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col] + dest_shim_offset, 8); // 8 byte alignment required for shared memory rows auto const validity_offset = col_offsets[num_columns]; + if (debug_print) { + printf("col_offsets[%d] = %d, col_sizes[%d] = %d, col_offsets[%d] = %d\n", block.end_col, col_offsets[block.end_col], block.end_col, col_sizes[block.end_col], block.start_col, col_offsets[block.start_col]); + printf("shmem row size %d\n", shmem_row_size); + printf("validity offset is %d\n", validity_offset); + printf("starting at %d,%d and going to %d, %d\n", block.start_col, block.start_row, block.end_col, block.end_row); + } for (int col=block.start_col; col<=block.end_col; ++col) { /*if (!col_is_variable) */{ uint64_t col_offset = 0; cudf::size_type col_size = col_sizes[col]; auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; + if (debug_print) { + printf("dest col offset %d\n", dest_col_offset); + } for (int row=block.start_row + threadIdx.x; row(input_data[col]); + if (debug_print) { + printf("%p <- short %d\n", shmem_dest, short_col_input[row]); + } *reinterpret_cast(shmem_dest) = short_col_input[row]; break; } case 4: { const int32_t *int_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { + printf("shmem[%d][%d] - %p <- int %d\n", row, col, shmem_dest, int_col_input[row]); + } *reinterpret_cast(shmem_dest) = int_col_input[row]; break; } case 8: { const int64_t *long_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { + printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); + } *reinterpret_cast(shmem_dest) = long_col_input[row]; break; } default: { cudf::size_type input_offset = col_size * row; - // TODO this should just not be supported for fixed width columns, but just in case... + if (debug_print) { + printf("byte for byte copy due to size %d\n", col_size); + printf("%p <- input_data[%d] which is %d\n", shmem_dest, input_offset, input_data[col][input_offset]); + } + // TODO this should just not be supported for fixed width columns, but just in case... for (cudf::size_type b = 0; b < col_size; b++) { shmem_dest[b] = input_data[col][b + input_offset]; } @@ -676,6 +722,12 @@ std::vector> convert_to_rows2(cudf::table_view con CUDA_TRY( cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + #if defined(DEBUG) + size_t free, total; + cudaMemGetInfo( &free, &total ); + printf("%lu/%lu Memory", free, total); + #endif + // break up the work into blocks, which are a starting and ending row/col #. // this window size is calculated based on the shared memory size available // we want a single block to fill up the entire shared memory space available @@ -692,7 +744,7 @@ std::vector> convert_to_rows2(cudf::table_view con // windows so the windows can be properly cut around them. 
std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row + std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column std::vector column_starts; // offset of column inside a row including alignment std::vector variable_width_columns; // list of the variable width columns in the table @@ -821,7 +873,7 @@ std::vector> convert_to_rows2(cudf::table_view con block_infos.emplace_back( detail::block_info{start_col, current_window_start_row, - start_col + end_col, + end_col, std::min(current_window_start_row + window_height - 1, num_rows), current_window_row_batch}); i += window_height; @@ -889,23 +941,20 @@ std::vector> convert_to_rows2(cudf::table_view con auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); - std::vector output_data; + std::vector output_buffers; + std::vector output_data; output_data.reserve(row_batches.size()); for (uint i=0; i(temp.data())); + output_buffers.push_back(std::move(temp)); } - auto dev_output_data = detail::copy_to_dev_async2(row_offsets, stream, mr); + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); // blast through the entire table and convert it - dim3 blocks; - dim3 threads; - blocks.x = block_infos.size(); - blocks.y = 0; - blocks.z = 0; - threads.x = 1024; - threads.y = 0; - threads.z = 0; - detail::copy_from_columns<<>>(num_rows, + dim3 blocks(block_infos.size()); + dim3 threads(1024); + copy_from_columns<<>>(num_rows, num_columns, dev_input_data.data(), dev_input_nm.data(), @@ -932,14 +981,14 @@ std::vector> convert_to_rows2(cudf::table_view con } offset_offset += row_batches[i].row_count; - auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); + auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); auto offsets = std::make_unique(data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); auto data = std::make_unique(data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, - std::move(output_data[i])); + std::move(output_buffers[i])); ret.push_back(cudf::make_lists_column(row_batches[i].row_count, std::move(offsets), From 6a267abad1dc539217f63fc41fa24b1788504955 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 16 Jun 2021 19:25:57 +0000 Subject: [PATCH 37/80] Updates and bug fixing --- .../row_conversion/row_conversion.cpp | 76 ++- cpp/src/row_conversion/row_conversion.cu | 498 ++++++++++++------ cpp/tests/row_conversion/row_conversion.cpp | 110 ++++ 3 files changed, 488 insertions(+), 196 deletions(-) create mode 100644 cpp/tests/row_conversion/row_conversion.cpp diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index 9fa05c408e5..e1228c9df21 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -25,10 +25,43 @@ class RowConversion : public cudf::benchmark { }; -static void BM_to_row(benchmark::State& state) +static void BM_old_to_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; -/* auto const table = create_random_table({cudf::type_id::INT8, + auto const table = create_random_table({cudf::type_id::INT8, + cudf::type_id::INT32, + cudf::type_id::INT16, + cudf::type_id::INT64, + cudf::type_id::INT32, + cudf::type_id::BOOL8, + 
cudf::type_id::UINT16, + cudf::type_id::UINT8, + cudf::type_id::UINT64}, + 212, + row_count{n_rows}); + /* auto const table = create_random_table({cudf::type_id::INT32}, + 64, + row_count{n_rows});*/ + + cudf::size_type total_bytes = 0; + for (int i = 0; i < table->num_columns(); ++i) { + auto t = table->get_column(i).type(); + total_bytes += cudf::size_of(t); + } + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + + auto rows = cudf::convert_to_rows(table->view()); + } + + state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); +} + +static void BM_new_to_row(benchmark::State& state) +{ + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + auto const table = create_random_table({cudf::type_id::INT8, cudf::type_id::INT32, cudf::type_id::INT16, cudf::type_id::INT64, @@ -37,11 +70,11 @@ static void BM_to_row(benchmark::State& state) cudf::type_id::UINT16, cudf::type_id::UINT8, cudf::type_id::UINT64}, - 50, - row_count{n_rows});*/ - auto const table = create_random_table({cudf::type_id::INT32}, - 64, - row_count{n_rows}); + 212, + row_count{n_rows}); + /* auto const table = create_random_table({cudf::type_id::INT32}, + 64, + row_count{n_rows});*/ cudf::size_type total_bytes = 0; for (int i = 0; i < table->num_columns(); ++i) { @@ -52,14 +85,13 @@ static void BM_to_row(benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); -// auto rows = cudf::convert_to_rows(table->view()); auto new_rows = cudf::convert_to_rows2(table->view()); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); } -static void BM_from_row(benchmark::State& state) +/*static void BM_from_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const table = create_random_table({cudf::type_id::INT8, @@ -73,9 +105,6 @@ static void BM_from_row(benchmark::State& state) cudf::type_id::UINT64}, 256, row_count{n_rows}); - /* auto const table = create_random_table({cudf::type_id::INT32}, - 4, - row_count{n_rows});*/ std::vector schema; cudf::size_type total_bytes = 0; @@ -94,18 +123,19 @@ static void BM_from_row(benchmark::State& state) } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { BM_to_row(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ +}*/ + +#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -TO_ROW_CONVERSION_BENCHMARK_DEFINE(to_row_conversion) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) #define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ BENCHMARK_DEFINE_F(RowConversion, name) \ @@ -116,4 +146,4 @@ TO_ROW_CONVERSION_BENCHMARK_DEFINE(to_row_conversion) ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) +//FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 
994233a0700..92ba075c316 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -44,7 +44,6 @@ static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size return (offset + alignment - 1) & ~(alignment - 1); } - /** * Copy a simple vector to device memory asynchronously. Be sure to read * the data on the same stream as is used to copy it. @@ -61,10 +60,9 @@ std::unique_ptr> copy_to_dev_async(const std::vector & } template -rmm::device_uvector copy_to_dev_async2( - const std::vector &input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) +rmm::device_uvector copy_to_dev_async2(const std::vector &input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { rmm::device_uvector ret(input.size(), stream, mr); CUDA_TRY(cudaMemcpyAsync( @@ -346,7 +344,7 @@ struct block_info { * @param block_infos information about the blocks of work * @param row_offsets offset to a specific row in the input data * @param output_data pointer to output data - * + * */ __global__ void copy_from_columns(const size_type num_rows, const size_type num_columns, @@ -366,92 +364,119 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. - bool debug_print = false; - + bool debug_print = false; // blockIdx.x == 70 && threadIdx.x == 448; + if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); printf("Column Info:\n"); - for (int i=0; i(&output_data[0][output_start_offset]) & 7; // offset for alignment shim in order to match shared memory with final dest + uint8_t const dest_shim_offset = + reinterpret_cast(&output_data[0][output_start_offset]) & + 7; // offset for alignment shim in order to match shared memory with final dest if (debug_print) { printf("outputting to offset %lu\n", output_start_offset); printf("dest shim offset is %d\n", dest_shim_offset); printf("Shared data is %p-%p\n", shared_data, shared_data + (48 * 1024)); + printf("my block is %d,%d -> %d,%d - buffer %d\n", + block.start_col, + block.start_row, + block.end_col, + block.end_row, + block.buffer_num); } // each thread is responsible for every threadcount rows of data. // the data is copies into shared memory in the final layout. 
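// A simplified sketch of the alignment-shim accounting used just below (the
// address and row width here are hypothetical; the kernel does the same
// bookkeeping per window): the staged row is shifted by dest_shim_offset so
// that, apart from a short head and tail copied byte-by-byte, the writeback to
// the output buffer happens as aligned 8-byte words.
// (Rows that fit entirely within one 8-byte slot are handled as a separate case.)
#include <cstdint>
#include <cstdio>

int main()
{
  uint64_t const dest_address      = 0x1003;  // hypothetical output row address, not 8-byte aligned
  int const      real_bytes_in_row = 21;      // hypothetical packed width of this window's columns

  int const dest_shim_offset = dest_address & 7;                                 // 3
  int const shmem_row_size   = (real_bytes_in_row + dest_shim_offset + 7) & ~7;  // 24
  int const head_bytes       = 8 - dest_shim_offset;                             // copied singly
  int const tail_bytes       = (real_bytes_in_row + dest_shim_offset) % 8;       // copied singly
  int const full_words       = (shmem_row_size - 8 - (tail_bytes ? 8 : 0)) / 8;

  printf("shim %d: %d head byte(s), %d aligned 8-byte word(s), %d tail byte(s)\n",
         dest_shim_offset, head_bytes, full_words, tail_bytes);
  return 0;
}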
- auto const shmem_row_size = align_offset(col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col] + dest_shim_offset, 8); // 8 byte alignment required for shared memory rows + auto const real_bytes_in_row = + col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col]; + auto const shmem_row_size = align_offset(real_bytes_in_row + dest_shim_offset, + 8); // 8 byte alignment required for shared memory rows auto const validity_offset = col_offsets[num_columns]; if (debug_print) { - printf("col_offsets[%d] = %d, col_sizes[%d] = %d, col_offsets[%d] = %d\n", block.end_col, col_offsets[block.end_col], block.end_col, col_sizes[block.end_col], block.start_col, col_offsets[block.start_col]); + printf("col_offsets[%d] = %d, col_sizes[%d] = %d, col_offsets[%d] = %d\n", + block.end_col, + col_offsets[block.end_col], + block.end_col, + col_sizes[block.end_col], + block.start_col, + col_offsets[block.start_col]); printf("shmem row size %d\n", shmem_row_size); printf("validity offset is %d\n", validity_offset); - printf("starting at %d,%d and going to %d, %d\n", block.start_col, block.start_row, block.end_col, block.end_row); + printf("starting at %d,%d and going to %d, %d\n", + block.start_col, + block.start_row, + block.end_col, + block.end_row); } - for (int col=block.start_col; col<=block.end_col; ++col) { - /*if (!col_is_variable) */{ - uint64_t col_offset = 0; + for (int col = block.start_col; col <= block.end_col; ++col) { + /*if (!col_is_variable) */ { + uint64_t col_offset = 0; cudf::size_type col_size = col_sizes[col]; - auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; - if (debug_print) { - printf("dest col offset %d\n", dest_col_offset); - } - for (int row=block.start_row + threadIdx.x; row(input_data[col]); - if (debug_print) { - printf("%p <- short %d\n", shmem_dest, short_col_input[row]); - } + const int16_t *short_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { printf("%p <- short %d\n", shmem_dest, short_col_input[row]); } *reinterpret_cast(shmem_dest) = short_col_input[row]; break; } case 4: { - const int32_t *int_col_input = reinterpret_cast(input_data[col]); + const int32_t *int_col_input = reinterpret_cast(input_data[col]); if (debug_print) { - printf("shmem[%d][%d] - %p <- int %d\n", row, col, shmem_dest, int_col_input[row]); + printf("shmem[%d][%d] - %p <- int 0x%x\n", row, col, shmem_dest, int_col_input[row]); } *reinterpret_cast(shmem_dest) = int_col_input[row]; break; } case 8: { - const int64_t *long_col_input = reinterpret_cast(input_data[col]); - if (debug_print) { - printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); - } + const int64_t *long_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); } *reinterpret_cast(shmem_dest) = long_col_input[row]; break; } default: { cudf::size_type input_offset = col_size * row; if (debug_print) { - printf("byte for byte copy due to size %d\n", col_size); - printf("%p <- input_data[%d] which is %d\n", shmem_dest, input_offset, input_data[col][input_offset]); - } - // TODO this should just not be supported for fixed width columns, but just in case... + printf("byte for byte copy due to size %d of column %d\n", col_size, col); + printf("%p <- input_data[%d] which is %d\n", + shmem_dest, + input_offset, + input_data[col][input_offset]); + } + // TODO this should just not be supported for fixed width columns, but just in case... 
for (cudf::size_type b = 0; b < col_size; b++) { shmem_dest[b] = input_data[col][b + input_offset]; } @@ -463,11 +488,13 @@ __global__ void copy_from_columns(const size_type num_rows, // so we have to rewrite the addresses to make sure that it is 4 byte aligned // we do this directly in the final location because the entire row may not // fit in shared memory and may require many blocks to process it entirely - int8_t *valid_byte = &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; + int8_t *valid_byte = + &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; cudf::size_type byte_bit_offset = col % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); + if (debug_print) { printf("Outputting validity to %p\n", valid_byte); } // Now copy validity for the column if (input_nm[col]) { if (bit_is_set(input_nm[col], row)) { @@ -479,11 +506,11 @@ __global__ void copy_from_columns(const size_type num_rows, // It is valid so just set the bit atomicOr_block(valid_int, 1 << int_bit_offset); } - } // end row + } // end row - col_offset += col_sizes[col] * (block.end_row - block.start_row); + col_offset += col_sizes[col] * rows_in_block; } - } // end col + } // end col // wait for the data to be totally copied into shared memory __syncthreads(); @@ -496,30 +523,75 @@ __global__ void copy_from_columns(const size_type num_rows, // row in shared memory may not be an entire row of the destination. // auto const thread_start_offset = threadIdx.x * 8; - auto const thread_stride = gridDim.x * 8; - for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * (block.end_row - block.start_row); src_offset += thread_stride) { + auto const thread_stride = gridDim.x * 8; + if (debug_print) { + printf("writing final data from %d to %d at stride %d\n", + thread_start_offset, + shmem_row_size * rows_in_block, + thread_stride); + printf("rows in block %d\n", rows_in_block); + } + for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * rows_in_block; + src_offset += thread_stride) { auto const output_row_num = src_offset / shmem_row_size; - auto const row_offset = row_offsets[block.start_row + output_row_num]; - auto const col_offset = src_offset % shmem_row_size; - int8_t *output_ptr = &output_data[block.buffer_num][row_offset + col_offset]; - int8_t *input_ptr = &shared_data[src_offset]; - // the first part and last part of the row is unaligned data copy. This is copied a single byte - // at a time. - if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { - // first part of a row, copy single bytes + auto const row_offset = row_offsets[block.start_row + output_row_num]; + auto const col_offset = src_offset % shmem_row_size; + int8_t *output_ptr = &output_data[block.buffer_num][row_offset + col_offset]; + int8_t *input_ptr = &shared_data[src_offset]; + + // three cases to worry about here + // 1) first 8-byte part of a large row - dest_shim_offset bytes of pad at the front + // 2) last 8-byte part of a large row - some bytes of pad at the end + // 3) corner case of <= 8 bytes of data, which means dest_shim_offset bytes of pad at the front + // AND potentially pad at the rear + + // we know the real number of bytes in a row, so we can figure out if we are in case 3 easily. + // 1st case is when we're at some even multiple of shmem_row_size offset. 
+ // 2nd case is when offset + 8 is some even multiple of shmem_row_size. + // must be an 8 byte copy + + // there is a chance we have a 0 dest_shim_offset and an 8 byte thing to copy, optimize? + if (real_bytes_in_row + dest_shim_offset <= 8) { + // case 3, we want to copy real_bytes_in_row bytes + auto const num_single_bytes = real_bytes_in_row - dest_shim_offset; + for (auto i = 0; i < num_single_bytes; ++i) { + if (debug_print) { + printf("case 3 - %d single byte final write %p -> %p\n", + num_single_bytes, + &input_ptr[i + dest_shim_offset], + &output_ptr[i]); + } + output_ptr[i] = input_ptr[i + dest_shim_offset]; + } + } else if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { + // first byte with leading pad auto const num_single_bytes = 8 - dest_shim_offset; - for (auto i=0; i %p\n", &input_ptr[i + dest_shim_offset], &output_ptr[i]); + } output_ptr[i] = input_ptr[i + dest_shim_offset]; } - } else if (dest_shim_offset > 0 && (src_offset + 8) % shmem_row_size == 0) { - // last part of a row, copy single bytes - auto const num_single_bytes = dest_shim_offset; - for (auto i=0; i 0) { + // last bytes of a row + auto const num_single_bytes = (real_bytes_in_row + dest_shim_offset) % 8; + for (auto i = 0; i < num_single_bytes; ++i) { + if (debug_print) { + printf("single trailing byte final write %p -> %p\n", + &input_ptr[i + dest_shim_offset], + &output_ptr[i]); + } output_ptr[i] = input_ptr[i + dest_shim_offset]; } } else { // copy 8 bytes aligned - const int64_t *long_col_input = reinterpret_cast(input_ptr); + const int64_t *long_col_input = reinterpret_cast(input_ptr); + if (debug_print) { + printf( + "long final write %p -> %p\n", long_col_input, reinterpret_cast(output_ptr)); + } *reinterpret_cast(output_ptr) = *long_col_input; } } @@ -696,13 +768,14 @@ std::vector> convert_to_rows2(cudf::table_view con rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the data, but small enough - // that multiple columns fit in memory so the writes can coalese as well. Potential optimization for window sizes. + // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the + // data, but small enough that multiple columns fit in memory so the writes can coalese as well. + // Potential optimization for window sizes. constexpr int max_window_height = 1024; - const size_type num_columns = tbl.num_columns(); - const size_type num_rows = tbl.num_rows(); + const size_type num_columns = tbl.num_columns(); + const size_type num_rows = tbl.num_rows(); - #if defined(DEBUG) +#if defined(DEBUG) auto pretty_print = [](uint64_t i) { if (i > (1 * 1024 * 1024 * 1024)) { printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); @@ -714,7 +787,7 @@ std::vector> convert_to_rows2(cudf::table_view con printf("%lu Bytes", i); } }; - #endif +#endif int device_id; CUDA_TRY(cudaGetDevice(&device_id)); @@ -722,11 +795,11 @@ std::vector> convert_to_rows2(cudf::table_view con CUDA_TRY( cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - #if defined(DEBUG) +#if defined(DEBUG) size_t free, total; - cudaMemGetInfo( &free, &total ); - printf("%lu/%lu Memory", free, total); - #endif + cudaMemGetInfo(&free, &total); + printf("%lu/%lu Memory\n", free, total); +#endif // break up the work into blocks, which are a starting and ending row/col #. 
// this window size is calculated based on the shared memory size available @@ -743,45 +816,46 @@ std::vector> convert_to_rows2(cudf::table_view con // to that point. These are row batches and they are decided first before building the // windows so the windows can be properly cut around them. - std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row + std::vector row_sizes; // size of each row in bytes including any alignment padding + std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column - std::vector column_starts; // offset of column inside a row including alignment - std::vector variable_width_columns; // list of the variable width columns in the table + std::vector column_starts; // offset of column inside a row including alignment + std::vector + variable_width_columns; // list of the variable width columns in the table row_sizes.reserve(num_rows); row_offsets.reserve(num_rows); column_sizes.reserve(num_columns); - column_starts.reserve(num_columns+1); // we add a final offset for validity data start + column_starts.reserve(num_columns + 1); // we add a final offset for validity data start size_type fixed_width_size_per_row = 0; for (int col = 0; col < num_columns; ++col) { - auto cv = tbl.column(col); - auto col_type = cv.type(); + auto cv = tbl.column(col); + auto col_type = cv.type(); bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - if (nested_type) { variable_width_columns.push_back(cv);} + if (nested_type) { variable_width_columns.push_back(cv); } // a list or string column will write a single uint64 // of data here for offset/length auto col_size = nested_type ? 8 : size_of(col_type); // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); column_starts.push_back(fixed_width_size_per_row); column_sizes.push_back(col_size); fixed_width_size_per_row += col_size; } - + // When building the columns to return, we have to be mindful of the offset limit in cudf. // It is 32-bit and these data columns are capable of surpassing that easily. The data should // not be cut off exactly at the limit though due to the validity buffers. The most efficient // place to cut the validity is on a 32-row boundary, so as we calculate the row sizes // we keep track of the cut points for the validity, which we call row batches. If the row - // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we hit. - // Note that this boundary is for our book-keeping with column pointers and not anything - // that the kernel needs to worry about. We cut the output at convienient boundaries - // when assembling the outgoing data stream. + // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we + // hit. Note that this boundary is for our book-keeping with column pointers and not anything that + // the kernel needs to worry about. We cut the output at convienient boundaries when assembling + // the outgoing data stream. 
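// A tiny standalone sketch of the 32-row boundary arithmetic described above
// (the row count is made up): when adding the next row would overflow the
// 32-bit offset limit, the batch keeps the rows up to the last multiple of 32
// and the remainder carries into the next batch, so the validity cut lands on
// a 32-row boundary.
#include <cstdio>

int main()
{
  int const rows_when_limit_hit = 1000005;                     // hypothetical
  int const rows_kept_in_batch  = rows_when_limit_hit & ~31;   // 1000000
  int const rows_carried_over   = rows_when_limit_hit & 31;    // 5
  printf("keep %d rows in this batch, carry %d rows into the next one\n",
         rows_kept_in_batch, rows_carried_over);
  return 0;
}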
struct row_batch { size_type num_bytes; size_type row_count; @@ -798,71 +872,90 @@ std::vector> convert_to_rows2(cudf::table_view con // will be included in the variable-width data blob at the end of the // row. return 0; -/* auto c = variable_width_columns[col]; - while (true) { - auto col_offsets = c.child(0).data(); - auto col_data_size = size_of(c.child(1).type()); - std::size_t alignment_needed = col_data_size; - - row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; - if (c.num_children() == 0) { - break; - } - c = c.child(1); - } -*/ + /* auto c = variable_width_columns[col]; + while (true) { + auto col_offsets = c.child(0).data(); + auto col_data_size = size_of(c.child(1).type()); + std::size_t alignment_needed = col_data_size; + + row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; + if (c.num_children() == 0) { + break; + } + c = c.child(1); + } + */ }; uint64_t row_batch_size = 0; uint64_t total_table_size = 0; - size_type row_batch_rows = 0; - uint64_t row_offset = 0; + size_type row_batch_rows = 0; + uint64_t row_offset = 0; + + auto calculate_validity_size = [](int const num_cols) { + // Now we need to add in space for validity + // Eventually we can think about nullable vs not nullable, but for now we will just always add + // it in + return (num_cols + 7) / 8; + }; - // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then calculate - // the size of each row's variable-width data as well. + // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then + // calculate the size of each row's variable-width data and validity as well. for (int row = 0; row < num_rows; ++row) { - row_sizes[row] = fixed_width_size_per_row + calculate_variable_width_row_data_size(row); - if (row_batch_size + row_sizes[row] > std::numeric_limits::max()) { + auto aligned_row_batch_size = + detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned + row_sizes[row] = fixed_width_size_per_row; + // validity is byte aligned + row_sizes[row] += calculate_validity_size(num_columns); + // variable width data is 8-byte aligned + row_sizes[row] = detail::align_offset(row_sizes[row], 8) + + calculate_variable_width_row_data_size(row); // rows are 8 byte aligned + + if (aligned_row_batch_size + row_sizes[row] > std::numeric_limits::max()) { // a new batch starts at the last 32-row boundary - row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); - row_batch_size = 0; - row_batch_rows = row_batch_rows & 31; - row_offset = 0; + row_batches.push_back( + row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batch_size = 0; + row_batch_rows = row_batch_rows & 31; + row_offset = 0; + aligned_row_batch_size = 0; } - row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned + row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned row_offsets.push_back(row_offset); - row_batch_size += row_sizes[row]; + row_batch_size = aligned_row_batch_size + row_sizes[row]; row_offset += row_sizes[row]; - total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned + total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned total_table_size += row_sizes[row]; row_batch_rows++; } if (row_batch_size > 0) { - row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows}); 
} - #if defined(DEBUG) +#if defined(DEBUG) + printf("%d rows and %d columns in table\n", num_rows, num_columns); printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { printf("%d: %d rows, ", i, row_batches[i].row_count); pretty_print(row_batches[i].num_bytes); printf("\n"); } - #endif +#endif std::vector block_infos; // block infos are organized with the windows going "down" the columns // this provides the most coalescing of memory access - int current_window_size = 0; + int current_window_width = 0; int current_window_start_col = 0; // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, num_rows](int const start_col, int const end_col, int const desired_window_height) { + auto build_blocks = [&block_infos, &row_batches, num_rows]( + int const start_col, int const end_col, int const desired_window_height) { int current_window_start_row = 0; int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; while (i < num_rows) { if (rows_left_in_batch == 0) { current_window_row_batch++; @@ -872,9 +965,10 @@ std::vector> convert_to_rows2(cudf::table_view con block_infos.emplace_back( detail::block_info{start_col, - current_window_start_row, - end_col, - std::min(current_window_start_row + window_height - 1, num_rows), current_window_row_batch}); + current_window_start_row, + end_col, + std::min(current_window_start_row + window_height - 1, num_rows - 1), + current_window_row_batch}); i += window_height; current_window_start_row += window_height; @@ -882,7 +976,17 @@ std::vector> convert_to_rows2(cudf::table_view con } }; - int const window_height = std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); + int const window_height = + std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); +#if defined(DEBUG) + printf( + "max_window_height is %d, num_rows is %d, batch row count is %d - which makes window height " + "%d\n", + max_window_height, + num_rows, + row_batches[0].row_count, + window_height); +#endif int row_size = 0; @@ -891,32 +995,74 @@ std::vector> convert_to_rows2(cudf::table_view con auto const col_size = column_sizes[col]; // align size for this type - std::size_t alignment_needed = col_size; // They are the same for fixed width types - auto row_size_with_this_col = detail::align_offset(row_size, alignment_needed) + col_size; + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_aligned = detail::align_offset(row_size, alignment_needed); + auto row_size_with_this_col = row_size_aligned + col_size; if (row_size_with_this_col * window_height > shmem_limit_per_block) { +#if defined(DEBUG) + printf( + "Window size %d too large at column %d, bumping back to build windows of size %d(cols " + "%d-%d), which is %d tall. 
Row size is too large at %d and ok at %d(aligned overall is %d) " + "for shared mem size %d\n", + row_size_with_this_col * window_height, + col, + row_size * window_height, + current_window_start_col, + col - 1, + window_height, + row_size_with_this_col, + row_size, + row_size_aligned, + shmem_limit_per_block); +#endif // too large, close this window, generate vertical blocks and restart build_blocks(current_window_start_col, col - 1, window_height); - row_size = detail::align_offset(column_starts[col] & 7, alignment_needed) + col_size; // alignment required for shared memory window boundary to match alignment of output row + row_size = + detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); +#if defined(DEBUG) + printf( + "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " + "or %d)\n", + row_size, + col_size, + row_size + col_size, + column_starts[col - 1], + column_sizes[col - 1], + column_starts[col - 1] + column_sizes[col - 1]); +#endif + row_size += col_size; // alignment required for shared memory window boundary to match + // alignment of output row current_window_start_col = col; + current_window_width = 0; } else { row_size = row_size_with_this_col; + current_window_width++; } } - auto validity_offset = detail::align_offset(column_starts.back(), 4); +#if defined(DEBUG) + printf("validity offset will be %d + %d = %d\n", + column_starts.back(), + column_sizes.back(), + column_starts.back() + column_sizes.back()); +#endif + auto validity_offset = detail::align_offset(column_starts.back() + column_sizes.back(), 4); column_starts.push_back(validity_offset); - + // build last set of blocks - if (current_window_size > 0) { build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); } + if (current_window_width > 0) { + build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); + } - // Get the pointers to the input columnar data ready - possibly moved up to copy to gpu while calculating other things + // Get the pointers to the input columnar data ready - possibly moved up to copy to gpu while + // calculating other things std::vector input_data; std::vector input_nm; for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); + column_view cv = tbl.column(column_number); auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; if (!nested_type) { input_data.emplace_back(cv.data()); @@ -924,81 +1070,87 @@ std::vector> convert_to_rows2(cudf::table_view con } } - #if defined(DEBUG) - printf("%lu windows for %d columns, %d rows to fit in ", block_infos.size(), block_infos[0].end_col - block_infos[0].start_col, block_infos[0].end_row - block_infos[0].start_row); +#if defined(DEBUG) + printf("%lu windows for %d columns, %d rows to fit in ", + block_infos.size(), + block_infos[0].end_col - block_infos[0].start_col + 1, + block_infos[0].end_row - block_infos[0].start_row); pretty_print(shmem_limit_per_block); printf(" shared mem("); pretty_print(fixed_width_size_per_row); printf("/row, %d columns, %d rows, ", num_columns, num_rows); pretty_print(total_table_size); printf(" total):\n"); - #endif +#endif auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); auto 
dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); - auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); std::vector output_buffers; std::vector output_data; output_data.reserve(row_batches.size()); - for (uint i=0; i(temp.data())); output_buffers.push_back(std::move(temp)); } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); // blast through the entire table and convert it dim3 blocks(block_infos.size()); - dim3 threads(1024); - copy_from_columns<<>>(num_rows, - num_columns, - dev_input_data.data(), - dev_input_nm.data(), - dev_col_sizes.data(), - dev_col_starts.data(), - dev_block_infos.data(), - dev_row_offsets.data(), - reinterpret_cast(dev_output_data.data())); + dim3 threads(std::min((uint64_t)1024, total_table_size / 8)); +#if defined(DEBUG) + printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); + pretty_print(shmem_limit_per_block); + printf(" shared memory\n"); +#endif + copy_from_columns<<>>( + num_rows, + num_columns, + dev_input_data.data(), + dev_input_nm.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + dev_block_infos.data(), + dev_row_offsets.data(), + reinterpret_cast(dev_output_data.data())); // split up the output buffer into multiple buffers based on row batch sizes // and create list of byte columns int offset_offset = 0; std::vector> ret; - for (uint i=0; i offset_vals; offset_vals.reserve(row_batches[i].row_count + 1); size_type cur_offset = 0; offset_vals.push_back(cur_offset); - for (int row=0; row(data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); + auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); + auto offsets = std::make_unique( + data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); - auto data = - std::make_unique(data_type{cudf::type_id::INT8}, - row_batches[i].num_bytes, - std::move(output_buffers[i])); + auto data = std::make_unique( + data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, std::move(output_buffers[i])); ret.push_back(cudf::make_lists_column(row_batches[i].row_count, - std::move(offsets), - std::move(data), - 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, - stream, - mr)); + std::move(offsets), + std::move(data), + 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, + stream, + mr)); } - + return ret; } diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp new file mode 100644 index 00000000000..c02f83ad1d5 --- /dev/null +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include + +struct ColumnToRowTests : public cudf::test::BaseFixture { +}; + +TEST_F(ColumnToRowTests, Single) +{ + cudf::test::fixed_width_column_wrapper a({-1}); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + auto new_rows = cudf::convert_to_rows2(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, Simple) +{ + cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + auto new_rows = cudf::convert_to_rows2(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, Tall) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + auto new_rows = cudf::convert_to_rows2(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, Wide) +{ + std::vector> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows(in); + auto new_rows = cudf::convert_to_rows2(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, SingleByteWide) +{ + std::vector> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows(in); + auto new_rows = cudf::convert_to_rows2(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} From 170a771d84347c0dd30ec9d9aa8eaf8041279ccf Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Mon, 21 Jun 2021 18:17:45 +0000 Subject: [PATCH 38/80] Updating windows to be generated in a square way so we can have more data to write out as 8-byte writes from shared memory. Shuffled some of the copy to GPU code up so it can start the copy sooner and hopefully won't force stalls. Some bug fixes. 
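// A minimal sketch of the "start the copy sooner" idea from the commit message,
// assuming the usual CUDA async-copy pattern; the names and sizes below are
// illustrative, not the functions this patch actually reorders: enqueue the
// host-to-device copies as soon as their data is ready, keep doing host-side
// planning while they are in flight, and let later work on the same stream
// order itself after the copies.
#include <cuda_runtime.h>
#include <vector>

int main()
{
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  std::vector<int> host_offsets(1024, 0);  // hypothetical metadata built on the host
  int *dev_offsets = nullptr;
  cudaMalloc(&dev_offsets, host_offsets.size() * sizeof(int));

  // 1) kick off the copy early (for true copy/compute overlap the host buffer
  //    would also need to be pinned via cudaMallocHost or cudaHostRegister)
  cudaMemcpyAsync(dev_offsets,
                  host_offsets.data(),
                  host_offsets.size() * sizeof(int),
                  cudaMemcpyHostToDevice,
                  stream);

  // 2) ...more host-side planning can happen here, overlapping the transfer...

  // 3) kernels launched on the same stream are ordered after the copy, so they
  //    can safely read dev_offsets; synchronize only when the host needs results.
  cudaStreamSynchronize(stream);
  cudaFree(dev_offsets);
  cudaStreamDestroy(stream);
  return 0;
}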
--- .../row_conversion/row_conversion.cpp | 15 ++- cpp/src/row_conversion/row_conversion.cu | 96 +++++++++++-------- 2 files changed, 67 insertions(+), 44 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index e1228c9df21..d6b195433cf 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -125,7 +125,7 @@ static void BM_new_to_row(benchmark::State& state) state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); }*/ -#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ +#define OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ BENCHMARK_DEFINE_F(RowConversion, name) \ (::benchmark::State & st) { f(st); } \ BENCHMARK_REGISTER_F(RowConversion, name) \ @@ -134,8 +134,17 @@ static void BM_new_to_row(benchmark::State& state) ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) -TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) +#define NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) +NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) #define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ BENCHMARK_DEFINE_F(RowConversion, name) \ diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 92ba075c316..3f221e2f716 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -364,7 +364,7 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. 
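The comment above describes the per-block tiling: each CUDA block is handed one pre-computed tile of the table (a column range by a row range) and stages it through shared memory. A standalone mirror of that bookkeeping, using the same field names the kernel reads from block_info but defined here only for illustration:

    // Illustration only: mirrors the fields the kernel reads from block_info.
    struct tile {
      int start_col, start_row;  // first column / row covered (inclusive)
      int end_col, end_row;      // last column / row covered (inclusive)
      int buffer_num;            // which output row batch this tile writes into
    };

    inline int rows_in_tile(tile const &t) { return t.end_row - t.start_row + 1; }
    inline int cols_in_tile(tile const &t) { return t.end_col - t.start_col + 1; }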
- bool debug_print = false; // blockIdx.x == 70 && threadIdx.x == 448; + constexpr bool debug_print = false; //blockIdx.x == 2649 && threadIdx.x == 479; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -383,6 +383,7 @@ __global__ void copy_from_columns(const size_type num_rows, }*/ printf("output data to %p\n", output_data[block_infos[blockIdx.x].buffer_num]); } + //else { return; } auto block = block_infos[blockIdx.x]; auto const rows_in_block = block.end_row - block.start_row + 1; extern __shared__ int8_t shared_data[]; @@ -416,7 +417,7 @@ __global__ void copy_from_columns(const size_type num_rows, col_sizes[block.end_col], block.start_col, col_offsets[block.start_col]); - printf("shmem row size %d\n", shmem_row_size); + printf("shmem row size %d with real bytes %d\n", shmem_row_size, real_bytes_in_row); printf("validity offset is %d\n", validity_offset); printf("starting at %d,%d and going to %d, %d\n", block.start_col, @@ -524,6 +525,8 @@ __global__ void copy_from_columns(const size_type num_rows, // auto const thread_start_offset = threadIdx.x * 8; auto const thread_stride = gridDim.x * 8; + auto const end_offset = shmem_row_size * rows_in_block; + if (debug_print) { printf("writing final data from %d to %d at stride %d\n", thread_start_offset, @@ -531,7 +534,7 @@ __global__ void copy_from_columns(const size_type num_rows, thread_stride); printf("rows in block %d\n", rows_in_block); } - for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * rows_in_block; + for (auto src_offset = thread_start_offset; src_offset < end_offset; src_offset += thread_stride) { auto const output_row_num = src_offset / shmem_row_size; auto const row_offset = row_offsets[block.start_row + output_row_num]; @@ -771,7 +774,6 @@ std::vector> convert_to_rows2(cudf::table_view con // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the // data, but small enough that multiple columns fit in memory so the writes can coalese as well. // Potential optimization for window sizes. - constexpr int max_window_height = 1024; const size_type num_columns = tbl.num_columns(); const size_type num_rows = tbl.num_rows(); @@ -816,6 +818,25 @@ std::vector> convert_to_rows2(cudf::table_view con // to that point. These are row batches and they are decided first before building the // windows so the windows can be properly cut around them. 
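The row-batch rule spelled out above amounts to: accumulate rows until the batch's byte size would no longer fit a 32-bit offset, then close the batch at the last 32-row boundary so validity words stay aligned. A host-side sketch of that rule, assuming per-row sizes are already known (a simplification of the loop added later in this patch):

    #include <cstddef>
    #include <cstdint>
    #include <limits>
    #include <numeric>
    #include <vector>

    struct batch { int64_t num_bytes; int row_count; };

    // Illustration only: split rows into batches that stay under the 32-bit offset limit.
    inline std::vector<batch> make_row_batches(std::vector<int64_t> const &row_sizes)
    {
      std::vector<batch> out;
      std::vector<int64_t> open;  // sizes of rows in the currently open batch
      int64_t open_bytes = 0;

      auto close = [&](std::size_t keep) {
        int64_t const kept = std::accumulate(open.begin(), open.begin() + keep, int64_t{0});
        out.push_back({kept, static_cast<int>(keep)});
        open.erase(open.begin(), open.begin() + keep);
        open_bytes -= kept;
      };

      for (auto const sz : row_sizes) {
        if (!open.empty() && open_bytes + sz > std::numeric_limits<int32_t>::max()) {
          std::size_t const keep = open.size() & ~std::size_t{31};  // last 32-row boundary
          close(keep != 0 ? keep : open.size());
        }
        open.push_back(sz);
        open_bytes += sz;
      }
      if (!open.empty()) { close(open.size()); }
      return out;
    }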
+ // Get the pointers to the input columnar data ready + std::vector input_data; + std::vector input_nm; + input_data.reserve(num_columns); + input_nm.reserve(num_columns); + for (size_type column_number = 0; column_number < num_columns; column_number++) { + column_view cv = tbl.column(column_number); + auto const col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (!nested_type) { + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + } + + auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + std::vector row_sizes; // size of each row in bytes including any alignment padding std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column @@ -847,6 +868,9 @@ std::vector> convert_to_rows2(cudf::table_view con fixed_width_size_per_row += col_size; } + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + // When building the columns to return, we have to be mindful of the offset limit in cudf. // It is 32-bit and these data columns are capable of surpassing that easily. The data should // not be cut off exactly at the limit though due to the validity buffers. The most efficient @@ -901,17 +925,18 @@ std::vector> convert_to_rows2(cudf::table_view con // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. + auto validity_size = calculate_validity_size(num_columns); for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned row_sizes[row] = fixed_width_size_per_row; // validity is byte aligned - row_sizes[row] += calculate_validity_size(num_columns); + row_sizes[row] += validity_size; // variable width data is 8-byte aligned row_sizes[row] = detail::align_offset(row_sizes[row], 8) + calculate_variable_width_row_data_size(row); // rows are 8 byte aligned - if (aligned_row_batch_size + row_sizes[row] > std::numeric_limits::max()) { + if ((uint64_t)aligned_row_batch_size + row_sizes[row] > (uint64_t)std::numeric_limits::max()) { // a new batch starts at the last 32-row boundary row_batches.push_back( row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); @@ -932,7 +957,9 @@ std::vector> convert_to_rows2(cudf::table_view con row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows}); } -#if defined(DEBUG) + auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + + #if defined(DEBUG) printf("%d rows and %d columns in table\n", num_rows, num_columns); printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { @@ -942,6 +969,16 @@ std::vector> convert_to_rows2(cudf::table_view con } #endif + std::vector output_buffers; + std::vector output_data; + output_data.reserve(row_batches.size()); + for (uint i = 0; i < row_batches.size(); ++i) { + rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); + output_data.push_back(static_cast(temp.data())); + output_buffers.push_back(std::move(temp)); + } + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + std::vector block_infos; // block infos are organized with the windows going "down" the 
columns @@ -976,8 +1013,13 @@ std::vector> convert_to_rows2(cudf::table_view con } }; + // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write would be memory cache line sized + // access, but since other blocks will read/write the edges this may not turn out to be overly important. + // For now, we will attempt to build a square window as far as byte sizes. x * y = shared_mem_size. + // Which translates to x^2 = shared_mem_size since we want them equal, so height and width are + // sqrt(shared_mem_size). The trick is that it's in bytes, not rows or columns. int const window_height = - std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); + std::min(std::min(size_type(sqrt(shmem_limit_per_block))/column_sizes[0], num_rows), row_batches[0].row_count); #if defined(DEBUG) printf( "max_window_height is %d, num_rows is %d, batch row count is %d - which makes window height " @@ -998,20 +1040,21 @@ std::vector> convert_to_rows2(cudf::table_view con std::size_t alignment_needed = col_size; // They are the same for fixed width types auto row_size_aligned = detail::align_offset(row_size, alignment_needed); auto row_size_with_this_col = row_size_aligned + col_size; + auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - if (row_size_with_this_col * window_height > shmem_limit_per_block) { + if (row_size_with_end_pad * window_height > shmem_limit_per_block) { #if defined(DEBUG) printf( "Window size %d too large at column %d, bumping back to build windows of size %d(cols " "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " "for shared mem size %d\n", - row_size_with_this_col * window_height, + row_size_with_end_pad * window_height, col, row_size * window_height, current_window_start_col, col - 1, window_height, - row_size_with_this_col, + row_size_with_end_pad, row_size, row_size_aligned, shmem_limit_per_block); @@ -1055,20 +1098,6 @@ std::vector> convert_to_rows2(cudf::table_view con build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); } - // Get the pointers to the input columnar data ready - possibly moved up to copy to gpu while - // calculating other things - std::vector input_data; - std::vector input_nm; - for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); - auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (!nested_type) { - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - } #if defined(DEBUG) printf("%lu windows for %d columns, %d rows to fit in ", @@ -1083,26 +1112,11 @@ std::vector> convert_to_rows2(cudf::table_view con printf(" total):\n"); #endif - auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); - auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); - - std::vector output_buffers; - std::vector output_data; - output_data.reserve(row_batches.size()); - for (uint i = 0; i < row_batches.size(); ++i) { - rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); - 
output_data.push_back(static_cast(temp.data())); - output_buffers.push_back(std::move(temp)); - } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); // blast through the entire table and convert it dim3 blocks(block_infos.size()); - dim3 threads(std::min((uint64_t)1024, total_table_size / 8)); + dim3 threads(std::min(1024, shmem_limit_per_block / 8)); #if defined(DEBUG) printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); pretty_print(shmem_limit_per_block); From a82cee8488b0d7aa61b4361b41c69fdf2bf07ccc Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 8 Jul 2021 01:52:36 +0000 Subject: [PATCH 39/80] Adding row to column conversion code. Performance falls off a cliff, but starts out reasonably. I haven't looked at this in nsight yet. --- .../row_conversion/row_conversion.cpp | 74 +- cpp/include/cudf/row_conversion.hpp | 12 + cpp/src/row_conversion/row_conversion.cu | 759 +++++++++++++----- cpp/tests/row_conversion/row_conversion.cpp | 106 +++ 4 files changed, 748 insertions(+), 203 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index d6b195433cf..7c1f52c5cd6 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -91,7 +91,7 @@ static void BM_new_to_row(benchmark::State& state) state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); } -/*static void BM_from_row(benchmark::State& state) +static void BM_old_from_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const table = create_random_table({cudf::type_id::INT8, @@ -123,36 +123,62 @@ static void BM_new_to_row(benchmark::State& state) } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -}*/ - -#define OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); +} + +static void BM_new_from_row(benchmark::State& state) +{ + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + auto const table = create_random_table({cudf::type_id::INT8, + cudf::type_id::INT32, + cudf::type_id::INT16, + cudf::type_id::INT64, + cudf::type_id::INT32, + cudf::type_id::BOOL8, + cudf::type_id::UINT16, + cudf::type_id::UINT8, + cudf::type_id::UINT64}, + 256, + row_count{n_rows}); + + std::vector schema; + cudf::size_type total_bytes = 0; + for (int i = 0; i < table->num_columns(); ++i) { + auto t = table->get_column(i).type(); + schema.push_back(t); + total_bytes += cudf::size_of(t); + } + + auto rows = cudf::convert_to_rows(table->view()); + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + + auto out = cudf::convert_from_rows2(rows, schema); + } + + state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); +} -#define NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ +#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, 
name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) -NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) -#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ +#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { BM_from_row(st); } \ + (::benchmark::State & st) { f(st); } \ BENCHMARK_REGISTER_F(RowConversion, name) \ ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 22}}) \ + ->Ranges({{1 << 6, 1 << 20}}) \ ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -//FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) +FROM_ROW_CONVERSION_BENCHMARK_DEFINE(old_from_row_conversion, BM_old_from_row) +FROM_ROW_CONVERSION_BENCHMARK_DEFINE(new_from_row_conversion, BM_new_from_row) diff --git a/cpp/include/cudf/row_conversion.hpp b/cpp/include/cudf/row_conversion.hpp index f5e2225ad19..282ffa4b0cb 100644 --- a/cpp/include/cudf/row_conversion.hpp +++ b/cpp/include/cudf/row_conversion.hpp @@ -48,4 +48,16 @@ std::unique_ptr convert_from_rows( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); +std::unique_ptr convert_from_rows2( + cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr convert_from_rows2( + std::vector> const &input, + std::vector const &schema, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + } // namespace cudf diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 3f221e2f716..c0e78a03576 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -30,6 +30,7 @@ #include #include +#include #include "cudf/types.hpp" #include "rmm/device_buffer.hpp" #include "thrust/iterator/counting_iterator.h" @@ -332,6 +333,20 @@ struct block_info { int buffer_num; }; +// When building the columns to return, we have to be mindful of the offset limit in cudf. +// It is 32-bit and these data columns are capable of surpassing that easily. The data should +// not be cut off exactly at the limit though due to the validity buffers. The most efficient +// place to cut the validity is on a 32-row boundary, so as we calculate the row sizes +// we keep track of the cut points for the validity, which we call row batches. If the row +// is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we +// hit. Note that this boundary is for our book-keeping with column pointers and not anything that +// the kernel needs to worry about. We cut the output at convienient boundaries when assembling +// the outgoing data stream. 
+struct row_batch { + size_type num_bytes; + size_type row_count; +}; + /** * @brief copy data from cudf columns into x format, which is row-based * @@ -364,7 +379,7 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. - constexpr bool debug_print = false; //blockIdx.x == 2649 && threadIdx.x == 479; + bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -383,7 +398,7 @@ __global__ void copy_from_columns(const size_type num_rows, }*/ printf("output data to %p\n", output_data[block_infos[blockIdx.x].buffer_num]); } - //else { return; } + // else { return; } auto block = block_infos[blockIdx.x]; auto const rows_in_block = block.end_row - block.start_row + 1; extern __shared__ int8_t shared_data[]; @@ -403,7 +418,7 @@ __global__ void copy_from_columns(const size_type num_rows, block.buffer_num); } // each thread is responsible for every threadcount rows of data. - // the data is copies into shared memory in the final layout. + // the data is copied into shared memory in the final layout. auto const real_bytes_in_row = col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col]; auto const shmem_row_size = align_offset(real_bytes_in_row + dest_shim_offset, @@ -432,7 +447,7 @@ __global__ void copy_from_columns(const size_type num_rows, auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; if (debug_print) { printf("dest col offset %d\n", dest_col_offset); } - for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += gridDim.x) { + for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { if (debug_print) { printf("shmem row %d(%d) at offset %d(%d)\n", row - block.start_row, @@ -524,8 +539,8 @@ __global__ void copy_from_columns(const size_type num_rows, // row in shared memory may not be an entire row of the destination. 
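One of the bug fixes in this commit is visible above: the per-row loop now strides by blockDim.x instead of gridDim.x. Each CUDA block owns exactly one tile, so the stride that covers every row of the tile exactly once is the block's thread count, not the grid's block count. A minimal sketch of the intended pattern (kernel name and parameters are illustrative):

    // Illustration: threads of one block cooperatively walk the tile's rows.
    __global__ void walk_tile_rows(int start_row, int end_row /* inclusive */)
    {
      for (int row = start_row + threadIdx.x; row <= end_row; row += blockDim.x) {
        // copy this row's slice of the tile into shared memory ...
      }
    }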
// auto const thread_start_offset = threadIdx.x * 8; - auto const thread_stride = gridDim.x * 8; - auto const end_offset = shmem_row_size * rows_in_block; + auto const thread_stride = blockDim.x * 8; + auto const end_offset = shmem_row_size * rows_in_block; if (debug_print) { printf("writing final data from %d to %d at stride %d\n", @@ -559,9 +574,10 @@ __global__ void copy_from_columns(const size_type num_rows, auto const num_single_bytes = real_bytes_in_row - dest_shim_offset; for (auto i = 0; i < num_single_bytes; ++i) { if (debug_print) { - printf("case 3 - %d single byte final write %p -> %p\n", + printf("case 3 - %d single byte final write %p(%d) -> %p\n", num_single_bytes, &input_ptr[i + dest_shim_offset], + input_ptr[i + dest_shim_offset], &output_ptr[i]); } output_ptr[i] = input_ptr[i + dest_shim_offset]; @@ -600,6 +616,237 @@ __global__ void copy_from_columns(const size_type num_rows, } } +/** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param offsets + * @param output_data + * @param output_nm + * @param col_sizes array of sizes for each element in a column - one per column + * @param col_offsets offset into input data row for each column's start + * @param block_infos information about the blocks of work + * @param input_data pointer to input data + * + */ +__global__ void copy_to_columns(const size_type num_rows, + const size_type num_columns, + const size_type *offsets, + int8_t **output_data, + cudf::bitmask_type **output_nm, + const size_type *col_sizes, + const size_type *col_offsets, + const block_info *block_infos, + const int8_t *input_data) +{ + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // This has been broken up for us in the block_info struct, so we don't have + // any calculation to do here, but it is important to note. + + bool debug_print = false; //blockIdx.x == 1 && threadIdx.x == 0; + + if (debug_print) { + printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); + printf("Column Info:\n"); + for (int i = 0; i < num_columns; ++i) { + printf("col %d is at %p with size %d and offset %d\n", + i, + output_data[i], + col_sizes[i], + col_offsets[i]); + } + printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); + /* printf("Row Offsets:\n"); + for (int i=0; i(&input_data[offsets[absolute_row] + offset_in_row]); + if (debug_print) { + printf("which will be address %p\n", long_col_input); + printf("%p <- long %lu\n", shmem_dest, *long_col_input); } + *reinterpret_cast(shmem_dest) = *long_col_input; + } + + __syncthreads(); + + // now we copy from shared memory to final destination. + // the data is laid out in rows in shared memory, so the reads + // for a column will be "vertical". Because of this and the different + // sizes for each column, this portion is handled on row/column basis. + // to prevent each thread working on a single row and also to ensure + // that all threads can do work in the case of more threads than rows, + // we do a global index instead of a double for loop with col/row. 
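The loop that follows decodes a single flat element index into a (column, row) pair inside the tile, so threads can be assigned elements independently of the tile's shape. A host-side analogue of the decode, assuming cols_in_block columns per tile:

    // Illustration: flat index -> (relative column, relative row) within a tile.
    struct element_pos { int relative_col; int relative_row; };

    inline element_pos decode_element(int index, int cols_in_block)
    {
      return {index % cols_in_block, index / cols_in_block};
    }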
+ for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { + auto const relative_col = index % cols_in_block; + auto const relative_row = index / cols_in_block; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + + auto const shared_memory_row_offset = window_quad_width * 8 * relative_row; + auto const shared_memory_offset = col_offsets[absolute_col] - col_offsets[block.start_col] + + shared_memory_row_offset + shared_memory_starting_pad; + auto const column_size = col_sizes[absolute_col]; + + int8_t *shmem_src = &shared_data[shared_memory_offset]; + int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; + + if (debug_print) { + printf("relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, shared_mmeory_row_offset: %d, shared_memory_offset: %d," + " column_size: %d, shmem_src: %p, dst: %p\n", relative_col, relative_row, absolute_col, absolute_row, shared_memory_row_offset, shared_memory_offset, column_size, + shmem_src, dst) ; + } + switch (column_size) { + case 1: { + if (debug_print) { printf("%p <- byte %d\n", dst, *shmem_src); } + *dst = *shmem_src; + break; + } + case 2: { + const int16_t *short_col_input = reinterpret_cast(shmem_src); + if (debug_print) { printf("%p <- short %d\n", dst, *short_col_input); } + *reinterpret_cast(dst) = *short_col_input; + break; + } + case 4: { + const int32_t *int_col_input = reinterpret_cast(shmem_src); + if (debug_print) { printf("%p <- int 0x%x\n", dst, *int_col_input); } + *reinterpret_cast(dst) = *int_col_input; + break; + } + case 8: { + const int64_t *long_col_input = reinterpret_cast(shmem_src); + if (debug_print) { printf("%p <- long %lu\n", dst, *long_col_input); } + *reinterpret_cast(dst) = *long_col_input; + break; + } + default: { + if (debug_print) { + printf("byte for byte copy due to size %d of column %d\n", column_size, absolute_col); + } + // TODO this should just not be supported for fixed width columns, but just in case... + for (cudf::size_type b = 0; b < column_size; b++) { dst[b] = shmem_src[b]; } + break; + } + } + } + + __syncthreads(); + + // now handle validity. Each thread is responsible for 32 rows in a single column. + // to prevent indexing issues with a large number of threads, this is compressed + // to a single loop like above. TODO: investigate using shared memory here + auto const validity_batches_per_col = (num_rows + 31) / 32; + auto const validity_batches_total = validity_batches_per_col * num_columns; + if (debug_print) { + printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n", validity_batches_per_col, validity_batches_total, num_rows); + } + for (int index = threadIdx.x; index < validity_batches_total; index += blockDim.x) { + // what column is this? 
+ auto const col = index / validity_batches_per_col; + auto const batch = index % validity_batches_per_col; + auto const starting_row = batch * 32; + auto const validity_offset = col_offsets[num_columns] + col / 8; + + if (debug_print) { + printf("col: %d, batch: %d, starting_row: %d, validity_offset: %d\n", col, batch, starting_row, validity_offset); + } + + int32_t dst_validity = 0; + for (int row = starting_row; row < std::min(num_rows, starting_row + 32); ++row) { + int8_t const * const validity_ptr = &input_data[offsets[row] + validity_offset]; + + if (debug_print) { + printf("validity_ptr is %p for row %d\nwhich is input_data[%d]\n", validity_ptr, row, offsets[row] + validity_offset); + } + + auto const val_byte = *validity_ptr; + auto const src_shift = col % 8; + auto const dst_shift = row % 32; + auto const src_bit_mask = 1 << src_shift; + if (debug_print) { + printf("src bit mask is 0x%x\n", src_bit_mask); + printf("src shift is 0x%x and dst shift is 0x%x\n", src_shift, dst_shift); + printf("validity bit is 0x%x\n", (val_byte & src_bit_mask) >> src_shift); + } +// auto const dst_bit_mask = 1 << dst_shift; + dst_validity |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); + if (debug_print) { + printf("validity is now 0x%x\n", dst_validity); + } + } + + + int32_t *validity_ptr = reinterpret_cast(output_nm[col] + (starting_row / 32)); + if (debug_print) { + printf("valiidty_ptr is output_nm[%d]: %p + starting_row / 8: %d because starting row is %d, which becomes %p\n", col, output_nm[col], starting_row / 32, starting_row, output_nm[col] + (starting_row / 32)); + printf("validity to write is %d\n", dst_validity); + printf("validity write %p <- %d\n", validity_ptr, dst_validity); + } + *validity_ptr = dst_validity; + } +} + /** * Calculate the dimensions of the kernel for fixed width only columns. * @param [in] num_columns the number of columns being copied. @@ -764,21 +1011,165 @@ static inline int32_t compute_fixed_width_layout(std::vector co return align_offset(at_offset, 8); // 8 bytes (64 bits) } -} // namespace detail +template +static size_type compute_column_information( + iterator begin, + iterator end, + std::vector &column_starts, + std::vector &column_sizes)//, + //std::function nested_type_cb) +{ + size_type fixed_width_size_per_row = 0; + for (auto cv = begin; cv != end; ++cv) { + auto col_type = std::get<0>(*cv); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + +// if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 
8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + } + + auto validity_offset = detail::align_offset(fixed_width_size_per_row, 4); + column_starts.push_back(validity_offset); + + return fixed_width_size_per_row; +} //#define DEBUG -std::vector> convert_to_rows2(cudf::table_view const &tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + +static std::vector build_block_infos(std::vector const &column_sizes, + std::vector const &column_starts, + std::vector const &row_batches, + size_type const total_number_of_rows, + size_type const &shmem_limit_per_block) { - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the - // data, but small enough that multiple columns fit in memory so the writes can coalese as well. - // Potential optimization for window sizes. - const size_type num_columns = tbl.num_columns(); - const size_type num_rows = tbl.num_rows(); + std::vector block_infos; + + // block infos are organized with the windows going "down" the columns + // this provides the most coalescing of memory access + int current_window_width = 0; + int current_window_start_col = 0; + + // build the blocks for a specific set of columns + auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( + int const start_col, int const end_col, int const desired_window_height) { + int current_window_start_row = 0; + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; + while (i < total_number_of_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(desired_window_height, rows_left_in_batch); + + block_infos.emplace_back(detail::block_info{ + start_col, + current_window_start_row, + end_col, + std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), + current_window_row_batch}); + + i += window_height; + current_window_start_row += window_height; + rows_left_in_batch -= window_height; + } + }; + + // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write + // would be memory cache line sized access, but since other blocks will read/write the edges this + // may not turn out to be overly important. For now, we will attempt to build a square window as + // far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we + // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in + // bytes, not rows or columns. 
+ int const window_height = std::min( + std::min(size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], total_number_of_rows), + row_batches[0].row_count); +#if defined(DEBUG) + printf( + "sqrt(shmem_limit_per_block) / column_sizes[0] is %d and num_rows is %d, batch row count is %d - which makes window height " + "%d\n", + size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], + total_number_of_rows, + row_batches[0].row_count, + window_height); +#endif + + int row_size = 0; + + // march each column and build the blocks of appropriate sizes + for (unsigned int col = 0; col < column_sizes.size(); ++col) { + auto const col_size = column_sizes[col]; + + // align size for this type + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_aligned = detail::align_offset(row_size, alignment_needed); + auto row_size_with_this_col = row_size_aligned + col_size; + auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); + + if (row_size_with_end_pad * window_height > shmem_limit_per_block) { +#if defined(DEBUG) + printf( + "Window size %d too large at column %d, bumping back to build windows of size %d(cols " + "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " + "for shared mem size %d\n", + row_size_with_end_pad * window_height, + col, + row_size * window_height, + current_window_start_col, + col - 1, + window_height, + row_size_with_end_pad, + row_size, + row_size_aligned, + shmem_limit_per_block); +#endif + // too large, close this window, generate vertical blocks and restart + build_blocks(current_window_start_col, col - 1, window_height); + row_size = + detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); +#if defined(DEBUG) + printf( + "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " + "or %d)\n", + row_size, + col_size, + row_size + col_size, + column_starts[col - 1], + column_sizes[col - 1], + column_starts[col - 1] + column_sizes[col - 1]); +#endif + row_size += col_size; // alignment required for shared memory window boundary to match + // alignment of output row + current_window_start_col = col; + current_window_width = 0; + } else { + row_size = row_size_with_this_col; + current_window_width++; + } + } + + // build last set of blocks + if (current_window_width > 0) { + build_blocks(current_window_start_col, (int)column_sizes.size()-1, window_height); + } + + return block_infos; +} +} // namespace detail #if defined(DEBUG) - auto pretty_print = [](uint64_t i) { + void pretty_print(uint64_t i) { if (i > (1 * 1024 * 1024 * 1024)) { printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); } else if (i > (1 * 1024 * 1024)) { @@ -788,9 +1179,19 @@ std::vector> convert_to_rows2(cudf::table_view con } else { printf("%lu Bytes", i); } - }; + } #endif +std::vector> convert_to_rows2(cudf::table_view const &tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the + // data, but small enough that multiple columns fit in memory so the writes can coalese as well. + // Potential optimization for window sizes. 
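Downstream of the window computation, the launch shape follows directly from the tile list: one CUDA block per block_info entry, with the thread count capped by how many 8-byte lanes the shared-memory staging buffer can feed. A rough sketch of that relationship; the exact caps vary between the commits in this series:

    #include <algorithm>
    #include <cstddef>
    #include <cuda_runtime.h>

    // Illustration: one block per tile; threads limited by 8-byte lanes in shared memory.
    inline dim3 grid_for_tiles(std::size_t num_tiles)
    {
      return dim3(static_cast<unsigned int>(num_tiles));
    }

    inline dim3 block_for_shmem(int shmem_limit_per_block)
    {
      return dim3(static_cast<unsigned int>(std::min(1024, shmem_limit_per_block / 8)));
    }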
+ const size_type num_columns = tbl.num_columns(); + const size_type num_rows = tbl.num_rows(); + int device_id; CUDA_TRY(cudaGetDevice(&device_id)); int shmem_limit_per_block; @@ -834,8 +1235,8 @@ std::vector> convert_to_rows2(cudf::table_view con } } - auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); std::vector row_sizes; // size of each row in bytes including any alignment padding std::vector row_offsets; // offset from the start of the data to this row @@ -848,43 +1249,48 @@ std::vector> convert_to_rows2(cudf::table_view con column_sizes.reserve(num_columns); column_starts.reserve(num_columns + 1); // we add a final offset for validity data start - size_type fixed_width_size_per_row = 0; - for (int col = 0; col < num_columns; ++col) { - auto cv = tbl.column(col); - auto col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&tbl](auto i) -> std::tuple { + return std::make_tuple(tbl.column(i).type(), tbl.column(i)); + }); + + size_type fixed_width_size_per_row = detail::compute_column_information( + iter, + iter + num_columns, + column_starts, + column_sizes);//, +// [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); + /* size_type fixed_width_size_per_row = 0; + for (int col = 0; col < num_columns; ++col) { + auto cv = tbl.column(col); + auto col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (nested_type) { variable_width_columns.push_back(cv); } + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + }*/ - if (nested_type) { variable_width_columns.push_back(cv); } +#if defined(DEBUG) + printf("validity offset will be %d + %d = %d\n", + column_starts.back(), + column_sizes.back(), + column_starts.back() + column_sizes.back()); +#endif - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 8 : size_of(col_type); - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - } + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); - - // When building the columns to return, we have to be mindful of the offset limit in cudf. 
- // It is 32-bit and these data columns are capable of surpassing that easily. The data should - // not be cut off exactly at the limit though due to the validity buffers. The most efficient - // place to cut the validity is on a 32-row boundary, so as we calculate the row sizes - // we keep track of the cut points for the validity, which we call row batches. If the row - // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we - // hit. Note that this boundary is for our book-keeping with column pointers and not anything that - // the kernel needs to worry about. We cut the output at convienient boundaries when assembling - // the outgoing data stream. - struct row_batch { - size_type num_bytes; - size_type row_count; - }; - std::vector row_batches; + std::vector row_batches; auto calculate_variable_width_row_data_size = [](int const row) { // each level of variable-width data will add an offset/length @@ -936,10 +1342,11 @@ std::vector> convert_to_rows2(cudf::table_view con row_sizes[row] = detail::align_offset(row_sizes[row], 8) + calculate_variable_width_row_data_size(row); // rows are 8 byte aligned - if ((uint64_t)aligned_row_batch_size + row_sizes[row] > (uint64_t)std::numeric_limits::max()) { + if ((uint64_t)aligned_row_batch_size + row_sizes[row] > + (uint64_t)std::numeric_limits::max()) { // a new batch starts at the last 32-row boundary row_batches.push_back( - row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); row_batch_size = 0; row_batch_rows = row_batch_rows & 31; row_offset = 0; @@ -954,12 +1361,12 @@ std::vector> convert_to_rows2(cudf::table_view con row_batch_rows++; } if (row_batch_size > 0) { - row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows}); + row_batches.push_back(detail::row_batch{static_cast(row_batch_size), row_batch_rows}); } auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); - #if defined(DEBUG) +#if defined(DEBUG) printf("%d rows and %d columns in table\n", num_rows, num_columns); printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { @@ -979,125 +1386,8 @@ std::vector> convert_to_rows2(cudf::table_view con } auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); - std::vector block_infos; - - // block infos are organized with the windows going "down" the columns - // this provides the most coalescing of memory access - int current_window_width = 0; - int current_window_start_col = 0; - - // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, num_rows]( - int const start_col, int const end_col, int const desired_window_height) { - int current_window_start_row = 0; - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; - while (i < num_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(desired_window_height, rows_left_in_batch); - - block_infos.emplace_back( - detail::block_info{start_col, - current_window_start_row, - end_col, - std::min(current_window_start_row + window_height - 1, num_rows - 1), - current_window_row_batch}); - - i += window_height; - current_window_start_row += window_height; - rows_left_in_batch -= window_height; - } - }; - - // the ideal window height has 
lots of 8-byte reads and 8-byte writes. The optimal read/write would be memory cache line sized - // access, but since other blocks will read/write the edges this may not turn out to be overly important. - // For now, we will attempt to build a square window as far as byte sizes. x * y = shared_mem_size. - // Which translates to x^2 = shared_mem_size since we want them equal, so height and width are - // sqrt(shared_mem_size). The trick is that it's in bytes, not rows or columns. - int const window_height = - std::min(std::min(size_type(sqrt(shmem_limit_per_block))/column_sizes[0], num_rows), row_batches[0].row_count); -#if defined(DEBUG) - printf( - "max_window_height is %d, num_rows is %d, batch row count is %d - which makes window height " - "%d\n", - max_window_height, - num_rows, - row_batches[0].row_count, - window_height); -#endif - - int row_size = 0; - - // march each column and build the blocks of appropriate sizes - for (int col = 0; col < num_columns; ++col) { - auto const col_size = column_sizes[col]; - - // align size for this type - std::size_t alignment_needed = col_size; // They are the same for fixed width types - auto row_size_aligned = detail::align_offset(row_size, alignment_needed); - auto row_size_with_this_col = row_size_aligned + col_size; - auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - - if (row_size_with_end_pad * window_height > shmem_limit_per_block) { -#if defined(DEBUG) - printf( - "Window size %d too large at column %d, bumping back to build windows of size %d(cols " - "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " - "for shared mem size %d\n", - row_size_with_end_pad * window_height, - col, - row_size * window_height, - current_window_start_col, - col - 1, - window_height, - row_size_with_end_pad, - row_size, - row_size_aligned, - shmem_limit_per_block); -#endif - // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col - 1, window_height); - row_size = - detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); -#if defined(DEBUG) - printf( - "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " - "or %d)\n", - row_size, - col_size, - row_size + col_size, - column_starts[col - 1], - column_sizes[col - 1], - column_starts[col - 1] + column_sizes[col - 1]); -#endif - row_size += col_size; // alignment required for shared memory window boundary to match - // alignment of output row - current_window_start_col = col; - current_window_width = 0; - } else { - row_size = row_size_with_this_col; - current_window_width++; - } - } - -#if defined(DEBUG) - printf("validity offset will be %d + %d = %d\n", - column_starts.back(), - column_sizes.back(), - column_starts.back() + column_sizes.back()); -#endif - auto validity_offset = detail::align_offset(column_starts.back() + column_sizes.back(), 4); - column_starts.push_back(validity_offset); - - // build last set of blocks - if (current_window_width > 0) { - build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); - } - + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); #if defined(DEBUG) printf("%lu windows for %d columns, %d rows to fit in ", @@ -1116,7 +1406,11 @@ std::vector> convert_to_rows2(cudf::table_view con // blast through the entire table and convert it dim3 blocks(block_infos.size()); - dim3 
threads(std::min(1024, shmem_limit_per_block / 8)); + #if defined(DEBUG) || 1 + dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)total_table_size)); + #else + dim3 threads(std::min(std::min(1024, shmem_limit_per_block / 8), (int)total_table_size)); + #endif #if defined(DEBUG) printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); pretty_print(shmem_limit_per_block); @@ -1206,11 +1500,11 @@ std::vector> convert_to_rows(cudf::table_view cons using ScalarType = cudf::scalar_type_t; auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - zero->set_valid(true, stream); + zero->set_valid_async(true, stream); static_cast(zero.get())->set_value(0, stream); auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - step->set_valid(true, stream); + step->set_valid_async(true, stream); static_cast(step.get()) ->set_value(static_cast(size_per_row), stream); @@ -1238,6 +1532,97 @@ std::vector> convert_to_rows(cudf::table_view cons } } +std::unique_ptr convert_from_rows2(cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + // verify that the types are what we expect + cudf::column_view child = input.child(); + cudf::type_id list_type = child.type().id(); + CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, + "Only a list of bytes is supported as input"); + + cudf::size_type num_columns = schema.size(); + cudf::size_type num_rows = input.parent().size(); + + int device_id; + CUDA_TRY(cudaGetDevice(&device_id)); + int shmem_limit_per_block; + CUDA_TRY( + cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + + std::vector column_starts; + std::vector column_sizes; + + auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { + return std::make_tuple(schema[i], nullptr); + }); + size_type fixed_width_size_per_row = detail::compute_column_information( + iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); + + size_type validity_size = (num_columns + 7) / 8; + + size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); + + // Ideally we would check that the offsets are all the same, etc. 
but for now + // this is probably fine + CUDF_EXPECTS(row_size * num_rows == child.size(), + "The layout of the data appears to be off"); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + + // build the row_batches from the passed in list column + std::vector row_batches; + + row_batches.push_back(detail::row_batch{child.size(), num_rows}); + + // Allocate the columns we are going to write into + std::vector> output_columns; + std::vector output_data; + std::vector output_nm; + for (cudf::size_type i = 0; i < num_columns; i++) { + auto column = cudf::make_fixed_width_column( + schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); + auto mut = column->mutable_view(); + output_data.emplace_back(mut.data()); + output_nm.emplace_back(mut.null_mask()); + output_columns.emplace_back(std::move(column)); + } + + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + auto dev_output_nm = detail::copy_to_dev_async2(output_nm, stream, mr); + + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); + + auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + + dim3 blocks(block_infos.size()); + #if defined(DEBUG) || 1 + dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)child.size())); + #else + dim3 threads(std::min(std::min(1024, shmem_limit_per_block / 8), (int)child.size())); + #endif +#if defined(DEBUG) + printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); + pretty_print(shmem_limit_per_block); + printf(" shared memory\n"); +#endif + detail::copy_to_columns<<>>( + num_rows, + num_columns, + input.offsets().data(), + dev_output_data.data(), + dev_output_nm.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + dev_block_infos.data(), + child.data()); + + return std::make_unique(std::move(output_columns)); +} + std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, std::vector const &schema, rmm::cuda_stream_view stream, @@ -1318,4 +1703,20 @@ std::unique_ptr convert_from_rows( // } } +std::unique_ptr convert_from_rows2( + std::vector> const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + CUDF_EXPECTS(input.size() == 1, "Too large of an input, need to concat the output tables..."); + + // for (uint i=0; iview(); + auto ret = convert_from_rows2(lcv, schema, stream, mr); + + return ret; + // } +} + } // namespace cudf diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index c02f83ad1d5..818d7a89ddb 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -21,9 +21,13 @@ #include #include +#include "cudf/lists/lists_column_view.hpp" +#include "cudf/types.hpp" struct ColumnToRowTests : public cudf::test::BaseFixture { }; +struct RowToColumnTests : public cudf::test::BaseFixture { +}; TEST_F(ColumnToRowTests, Single) { @@ -108,3 +112,105 @@ TEST_F(ColumnToRowTests, SingleByteWide) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } } + +TEST_F(RowToColumnTests, Single) +{ + cudf::test::fixed_width_column_wrapper a({-1}); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema{cudf::data_type{cudf::type_id::INT32}}; + for (uint i=0; i a({-1, 0, 1}); + cudf::table_view 
in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema{cudf::data_type{cudf::type_id::INT32}}; + for (uint i=0; i int32_t { return rand(); }); + cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema; + schema.reserve(in.num_columns()); + for (auto col = in.begin(); col < in.end(); ++col) { + schema.push_back(col->type()); + } + for (uint i=0; i> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema; + schema.reserve(in.num_columns()); + for (auto col = in.begin(); col < in.end(); ++col) { + schema.push_back(col->type()); + } + + for (uint i=0; i> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema; + schema.reserve(in.num_columns()); + for (auto col = in.begin(); col < in.end(); ++col) { + schema.push_back(col->type()); + } + for (uint i=0; i Date: Thu, 8 Jul 2021 20:45:18 +0000 Subject: [PATCH 40/80] updating to use make_device_uvector_async and bitmask functions per review comments --- cpp/src/row_conversion/row_conversion.cu | 125 +++++++++-------------- 1 file changed, 47 insertions(+), 78 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index c0e78a03576..c73e967cf0f 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -36,6 +37,7 @@ #include "thrust/iterator/counting_iterator.h" #include "thrust/iterator/transform_iterator.h" +using cudf::detail::make_device_uvector_async; namespace cudf { namespace detail { @@ -45,32 +47,6 @@ static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size return (offset + alignment - 1) & ~(alignment - 1); } -/** - * Copy a simple vector to device memory asynchronously. Be sure to read - * the data on the same stream as is used to copy it. 
- */ -template -std::unique_ptr> copy_to_dev_async(const std::vector &input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - std::unique_ptr> ret(new rmm::device_uvector(input.size(), stream, mr)); - CUDA_TRY(cudaMemcpyAsync( - ret->data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); - return ret; -} - -template -rmm::device_uvector copy_to_dev_async2(const std::vector &input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - rmm::device_uvector ret(input.size(), stream, mr); - CUDA_TRY(cudaMemcpyAsync( - ret.data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); - return ret; -} - __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type row_size, @@ -180,8 +156,8 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, } cudf::bitmask_type *nm = output_nm[col_index]; - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; + int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; + cudf::size_type byte_bit_offset = intra_word_index(col_index); int predicate = *valid_byte & (1 << byte_bit_offset); uint32_t bitmask = __ballot_sync(active_mask, predicate); if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } @@ -278,8 +254,8 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, } // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; + int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; + cudf::size_type byte_bit_offset = intra_word_index(col_index); uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -505,8 +481,8 @@ __global__ void copy_from_columns(const size_type num_rows, // we do this directly in the final location because the entire row may not // fit in shared memory and may require many blocks to process it entirely int8_t *valid_byte = - &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; - cudf::size_type byte_bit_offset = col % 8; + &output_data[block.buffer_num][row_offsets[row] + validity_offset + word_index(col)]; + cudf::size_type byte_bit_offset = intra_word_index(col); uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -648,7 +624,7 @@ __global__ void copy_to_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. 
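For reference on the helpers this commit switches to: in cudf's bit utilities, word_index and intra_word_index operate on 32-bit bitmask words (bitmask_type), i.e. they divide and take the remainder by 32 rather than by 8. The equivalences below are my reading of those utilities, stated as an assumption about the library rather than as part of this patch:

    // Assumed behaviour of cudf::word_index / cudf::intra_word_index for a
    // 32-bit bitmask_type (word-based, unlike the `/ 8` and `% 8` they replace).
    constexpr int bits_per_bitmask_word = 32;

    constexpr int word_index_equiv(int bit_index) { return bit_index / bits_per_bitmask_word; }
    constexpr int intra_word_index_equiv(int bit_index) { return bit_index % bits_per_bitmask_word; }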
- bool debug_print = false; //blockIdx.x == 1 && threadIdx.x == 0; + bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -806,7 +782,7 @@ __global__ void copy_to_columns(const size_type num_rows, auto const col = index / validity_batches_per_col; auto const batch = index % validity_batches_per_col; auto const starting_row = batch * 32; - auto const validity_offset = col_offsets[num_columns] + col / 8; + auto const validity_offset = col_offsets[num_columns] + word_index(col); if (debug_print) { printf("col: %d, batch: %d, starting_row: %d, validity_offset: %d\n", col, batch, starting_row, validity_offset); @@ -821,7 +797,7 @@ __global__ void copy_to_columns(const size_type num_rows, } auto const val_byte = *validity_ptr; - auto const src_shift = col % 8; + auto const src_shift = intra_word_index(col); auto const dst_shift = row % 32; auto const src_bit_mask = 1 << src_shift; if (debug_print) { @@ -920,10 +896,10 @@ static std::unique_ptr fixed_width_convert_to_rows( const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type size_per_row, - std::unique_ptr> &column_start, - std::unique_ptr> &column_size, - std::unique_ptr> &input_data, - std::unique_ptr> &input_nm, + rmm::device_uvector &column_start, + rmm::device_uvector &column_size, + rmm::device_uvector &input_data, + rmm::device_uvector &input_nm, const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, rmm::cuda_stream_view stream, @@ -954,10 +930,10 @@ static std::unique_ptr fixed_width_convert_to_rows( num_rows, num_columns, size_per_row, - column_start->data(), - column_size->data(), - input_data->data(), - input_nm->data(), + column_start.data(), + column_size.data(), + input_data.data(), + input_nm.data(), data->mutable_view().data()); return cudf::make_lists_column(num_rows, @@ -1004,7 +980,7 @@ static inline int32_t compute_fixed_width_layout(std::vector co // Now we need to add in space for validity // Eventually we can think about nullable vs not nullable, but for now we will just always add it // in - int32_t validity_bytes_needed = (schema.size() + 7) / 8; + int32_t validity_bytes_needed = word_index(schema.size() + 7); // validity comes at the end and is byte aligned so we can pack more in. 
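// A small host-side sketch of the two validity sizes at play here, assuming the
// layout described above: the row format packs one validity bit per column, byte
// aligned, while cudf bitmasks are built from 32-bit words. Names are illustrative.
#include <cstddef>
#include <cstdint>

constexpr int32_t row_validity_bytes(std::size_t num_columns)
{
  return static_cast<int32_t>((num_columns + 7) / 8);  // byte-aligned bits
}

constexpr int32_t bitmask_word_bytes(std::size_t num_columns)
{
  return static_cast<int32_t>((num_columns + 31) / 32) * 4;  // whole 32-bit words
}

// 10 columns need 2 bytes of byte-aligned validity but 4 bytes of bitmask words.
static_assert(row_validity_bytes(10) == 2, "byte-aligned validity");
static_assert(bitmask_word_bytes(10) == 4, "32-bit bitmask words");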
at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned @@ -1235,8 +1211,8 @@ std::vector> convert_to_rows2(cudf::table_view con } } - auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + auto dev_input_data = make_device_uvector_async(input_data, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); std::vector row_sizes; // size of each row in bytes including any alignment padding std::vector row_offsets; // offset from the start of the data to this row @@ -1287,8 +1263,8 @@ std::vector> convert_to_rows2(cudf::table_view con #endif - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); std::vector row_batches; @@ -1322,16 +1298,9 @@ std::vector> convert_to_rows2(cudf::table_view con size_type row_batch_rows = 0; uint64_t row_offset = 0; - auto calculate_validity_size = [](int const num_cols) { - // Now we need to add in space for validity - // Eventually we can think about nullable vs not nullable, but for now we will just always add - // it in - return (num_cols + 7) / 8; - }; - // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. - auto validity_size = calculate_validity_size(num_columns); + auto validity_size = num_bitmask_words(num_columns); for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned @@ -1364,7 +1333,7 @@ std::vector> convert_to_rows2(cudf::table_view con row_batches.push_back(detail::row_batch{static_cast(row_batch_size), row_batch_rows}); } - auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); #if defined(DEBUG) printf("%d rows and %d columns in table\n", num_rows, num_columns); @@ -1384,7 +1353,7 @@ std::vector> convert_to_rows2(cudf::table_view con output_data.push_back(static_cast(temp.data())); output_buffers.push_back(std::move(temp)); } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); std::vector block_infos = build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); @@ -1402,7 +1371,7 @@ std::vector> convert_to_rows2(cudf::table_view con printf(" total):\n"); #endif - auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); // blast through the entire table and convert it dim3 blocks(block_infos.size()); @@ -1443,7 +1412,7 @@ std::vector> convert_to_rows2(cudf::table_view con } offset_offset += row_batches[i].row_count; - auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); + auto dev_offsets = make_device_uvector_async(offset_vals, stream, mr); auto offsets = std::make_unique( data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); @@ -1477,8 +1446,8 @@ std::vector> convert_to_rows(cudf::table_view cons std::vector column_size; int32_t size_per_row = 
detail::compute_fixed_width_layout(schema, column_start, column_size); - auto dev_column_start = detail::copy_to_dev_async(column_start, stream, mr); - auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); + auto dev_column_start = make_device_uvector_async(column_start, stream, mr); + auto dev_column_size = make_device_uvector_async(column_size, stream, mr); int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; // Make the number of rows per batch a multiple of 32 so we don't have to worry about @@ -1495,8 +1464,8 @@ std::vector> convert_to_rows(cudf::table_view cons input_data.emplace_back(cv.data()); input_nm.emplace_back(cv.null_mask()); } - auto dev_input_data = detail::copy_to_dev_async(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async(input_nm, stream, mr); + auto dev_input_data = make_device_uvector_async(input_data, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); using ScalarType = cudf::scalar_type_t; auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); @@ -1561,7 +1530,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i size_type fixed_width_size_per_row = detail::compute_column_information( iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); - size_type validity_size = (num_columns + 7) / 8; + size_type validity_size = num_bitmask_words(num_columns); size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); @@ -1569,8 +1538,8 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i // this is probably fine CUDF_EXPECTS(row_size * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); // build the row_batches from the passed in list column std::vector row_batches; @@ -1590,13 +1559,13 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i output_columns.emplace_back(std::move(column)); } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); - auto dev_output_nm = detail::copy_to_dev_async2(output_nm, stream, mr); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); std::vector block_infos = build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); dim3 blocks(block_infos.size()); #if defined(DEBUG) || 1 @@ -1647,8 +1616,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in // this is probably fine CUDF_EXPECTS(size_per_row * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_column_start = detail::copy_to_dev_async(column_start, stream, mr); - auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); + auto dev_column_start = make_device_uvector_async(column_start, stream); + auto dev_column_size = make_device_uvector_async(column_size, stream); // Allocate the columns we are going to write into std::vector> output_columns; @@ -1663,8 +1632,8 @@ 
std::unique_ptr convert_from_rows(cudf::lists_column_view const &in output_columns.emplace_back(std::move(column)); } - auto dev_output_data = detail::copy_to_dev_async(output_data, stream, mr); - auto dev_output_nm = detail::copy_to_dev_async(output_nm, stream, mr); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); dim3 blocks; dim3 threads; @@ -1675,10 +1644,10 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in num_rows, num_columns, size_per_row, - dev_column_start->data(), - dev_column_size->data(), - dev_output_data->data(), - dev_output_nm->data(), + dev_column_start.data(), + dev_column_size.data(), + dev_output_data.data(), + dev_output_nm.data(), child.data()); return std::make_unique(std::move(output_columns)); From b044f8b10c606b495bbb4284a754f50f4a6eb7a4 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 13 Jul 2021 07:18:49 +0000 Subject: [PATCH 41/80] updating conversion code. Found out bit operations are on 32-bit values, so they can't be used since row data has byte-aligned validity. Performance improvements on the row to column side. --- cpp/src/row_conversion/row_conversion.cu | 106 ++++++++++++----------- 1 file changed, 54 insertions(+), 52 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index c73e967cf0f..0879a1c50a5 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -37,6 +37,8 @@ #include "thrust/iterator/counting_iterator.h" #include "thrust/iterator/transform_iterator.h" +#define NUM_BLOCKS_PER_KERNEL_TO_COLUMNS (2) + using cudf::detail::make_device_uvector_async; namespace cudf { @@ -156,11 +158,11 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, } cudf::bitmask_type *nm = output_nm[col_index]; - int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; - cudf::size_type byte_bit_offset = intra_word_index(col_index); + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; int predicate = *valid_byte & (1 << byte_bit_offset); uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } + if (row_index % 32 == 0) { nm[row_index / 8] = bitmask; } } // end column loop } // end row copy // wait for the row_group to be totally copied before starting on the next row group @@ -254,8 +256,8 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, } // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; - cudf::size_type byte_bit_offset = intra_word_index(col_index); + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -481,8 +483,8 @@ __global__ void copy_from_columns(const size_type num_rows, // we do this directly in the final location because the entire row may not // fit in shared memory and may require many blocks to process it entirely int8_t *valid_byte = - &output_data[block.buffer_num][row_offsets[row] + validity_offset + word_index(col)]; - cudf::size_type 
byte_bit_offset = intra_word_index(col); + &output_data[block.buffer_num][row_offsets[row] + validity_offset + (col / 8)]; + cudf::size_type byte_bit_offset = col % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -597,6 +599,7 @@ __global__ void copy_from_columns(const size_type num_rows, * * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block * @param offsets * @param output_data * @param output_nm @@ -608,6 +611,7 @@ __global__ void copy_from_columns(const size_type num_rows, */ __global__ void copy_to_columns(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, const size_type *offsets, int8_t **output_data, cudf::bitmask_type **output_nm, @@ -624,18 +628,10 @@ __global__ void copy_to_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. - bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; + constexpr bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("Column Info:\n"); - for (int i = 0; i < num_columns; ++i) { - printf("col %d is at %p with size %d and offset %d\n", - i, - output_data[i], - col_sizes[i], - col_offsets[i]); - } printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); /* printf("Row Offsets:\n"); for (int i=0; i blockDim.x) { + break; + } + auto block = block_infos[this_block_index]; auto const rows_in_block = block.end_row - block.start_row + 1; auto const cols_in_block = block.end_col - block.start_col + 1; extern __shared__ int8_t shared_data[]; @@ -767,61 +769,58 @@ __global__ void copy_to_columns(const size_type num_rows, } } - __syncthreads(); - - // now handle validity. Each thread is responsible for 32 rows in a single column. + // now handle validity. Each thread is responsible for 32 rows in 8 columns. // to prevent indexing issues with a large number of threads, this is compressed // to a single loop like above. TODO: investigate using shared memory here auto const validity_batches_per_col = (num_rows + 31) / 32; - auto const validity_batches_total = validity_batches_per_col * num_columns; - if (debug_print) { - printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n", validity_batches_per_col, validity_batches_total, num_rows); + auto const validity_batches_total = std::max(1, validity_batches_per_col * (num_columns / 8)); + if (debug_print && threadIdx.x == 0 && blockIdx.x == 0) { + printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n%d blocks of %d threads\n", validity_batches_per_col, validity_batches_total, num_rows, gridDim.x, blockDim.x); } - for (int index = threadIdx.x; index < validity_batches_total; index += blockDim.x) { - // what column is this? 
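// A minimal host-side model of the bit movement below, assuming the row format
// described in this file: each row stores one validity bit per column, byte
// aligned, and each output bitmask word collects 32 rows of a single column.
// accumulate_validity is an illustrative helper, not the kernel's code.
#include <cstdint>

inline void accumulate_validity(uint8_t const *row_validity,  // start of this row's validity bytes
                                int col,
                                int row,
                                uint32_t *column_word)  // word covering rows [32*(row/32), +32)
{
  uint32_t const bit = (row_validity[col / 8] >> (col % 8)) & 1u;
  *column_word |= bit << (row % 32);
}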
- auto const col = index / validity_batches_per_col; + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < validity_batches_total; index += blockDim.x * gridDim.x) { + auto const start_col = (index * 8) / validity_batches_per_col; auto const batch = index % validity_batches_per_col; auto const starting_row = batch * 32; - auto const validity_offset = col_offsets[num_columns] + word_index(col); + auto const validity_offset = col_offsets[num_columns] + (start_col / 8); if (debug_print) { - printf("col: %d, batch: %d, starting_row: %d, validity_offset: %d\n", col, batch, starting_row, validity_offset); + printf("%d-%d: cols: %d-%d, word index: %d, batch: %d, starting_row: %d, +validity_offset: %d, index: %d, stride: %d\n", threadIdx.x, blockIdx.x, start_col, start_col + 7, (start_col / 8), batch, starting_row, validity_offset, index, blockDim.x * gridDim.x); } - int32_t dst_validity = 0; + // one for each column + int32_t dst_validity[8] = {0}; for (int row = starting_row; row < std::min(num_rows, starting_row + 32); ++row) { int8_t const * const validity_ptr = &input_data[offsets[row] + validity_offset]; if (debug_print) { - printf("validity_ptr is %p for row %d\nwhich is input_data[%d]\n", validity_ptr, row, offsets[row] + validity_offset); + printf("%d: validity_ptr is %p for row %d\n", threadIdx.x, validity_ptr, row); } auto const val_byte = *validity_ptr; - auto const src_shift = intra_word_index(col); - auto const dst_shift = row % 32; - auto const src_bit_mask = 1 << src_shift; - if (debug_print) { - printf("src bit mask is 0x%x\n", src_bit_mask); - printf("src shift is 0x%x and dst shift is 0x%x\n", src_shift, dst_shift); - printf("validity bit is 0x%x\n", (val_byte & src_bit_mask) >> src_shift); - } -// auto const dst_bit_mask = 1 << dst_shift; - dst_validity |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); - if (debug_print) { - printf("validity is now 0x%x\n", dst_validity); + + for (int i=0; i> src_shift); + } + // auto const dst_bit_mask = 1 << dst_shift; + dst_validity[i] |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); } } - int32_t *validity_ptr = reinterpret_cast(output_nm[col] + (starting_row / 32)); - if (debug_print) { - printf("valiidty_ptr is output_nm[%d]: %p + starting_row / 8: %d because starting row is %d, which becomes %p\n", col, output_nm[col], starting_row / 32, starting_row, output_nm[col] + (starting_row / 32)); - printf("validity to write is %d\n", dst_validity); - printf("validity write %p <- %d\n", validity_ptr, dst_validity); + for (int i=0; i(output_nm[start_col + i] + (starting_row / 32)); + if (debug_print) { + printf("%d-%d: validity write output_nm[%d][%d] - %p <- %d\n", threadIdx.x, blockIdx.x, start_col + i, starting_row, validity_ptr, dst_validity[i]); + } + *validity_ptr = dst_validity[i]; } - *validity_ptr = dst_validity; } } +} /** * Calculate the dimensions of the kernel for fixed width only columns. @@ -980,7 +979,7 @@ static inline int32_t compute_fixed_width_layout(std::vector co // Now we need to add in space for validity // Eventually we can think about nullable vs not nullable, but for now we will just always add it // in - int32_t validity_bytes_needed = word_index(schema.size() + 7); + int32_t validity_bytes_needed = (schema.size() + 7) / 8; // validity comes at the end and is byte aligned so we can pack more in. 
at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned @@ -1300,7 +1299,7 @@ std::vector> convert_to_rows2(cudf::table_view con // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. - auto validity_size = num_bitmask_words(num_columns); + auto validity_size = num_bitmask_words(num_columns) * 4; for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned @@ -1521,6 +1520,8 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i CUDA_TRY( cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + shmem_limit_per_block /= NUM_BLOCKS_PER_KERNEL_TO_COLUMNS; + std::vector column_starts; std::vector column_sizes; @@ -1530,7 +1531,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i size_type fixed_width_size_per_row = detail::compute_column_information( iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); - size_type validity_size = num_bitmask_words(num_columns); + size_type validity_size = num_bitmask_words(num_columns) * 4; size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); @@ -1567,7 +1568,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - dim3 blocks(block_infos.size()); + dim3 blocks((block_infos.size() + (NUM_BLOCKS_PER_KERNEL_TO_COLUMNS - 1)) / NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); #if defined(DEBUG) || 1 dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)child.size())); #else @@ -1581,6 +1582,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i detail::copy_to_columns<<>>( num_rows, num_columns, + shmem_limit_per_block, input.offsets().data(), dev_output_data.data(), dev_output_nm.data(), From d2a33ed396e935a9b1c8ca44df26b51bc37e2d9b Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Mon, 7 Jun 2021 08:14:52 +0000 Subject: [PATCH 42/80] working on row and column conversions --- cpp/benchmarks/CMakeLists.txt | 41 +- .../row_conversion/row_conversion.cpp | 106 +- cpp/include/cudf/row_conversion.hpp | 12 - cpp/src/row_conversion/row_conversion.cu | 1183 +++++------------ 4 files changed, 357 insertions(+), 985 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index fa1e61e26fd..a8f075d2464 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -22,10 +22,21 @@ target_compile_options( "$<$:${CUDF_CUDA_FLAGS}>" ) +<<<<<<< HEAD target_link_libraries( cudf_datagen PUBLIC GTest::gmock GTest::gtest GTest::gmock_main GTest::gtest_main benchmark::benchmark nvbench::nvbench Threads::Threads cudf ) +======= +target_link_libraries(cudf_datagen + PUBLIC GTest::gmock + GTest::gtest + GTest::gmock_main + GTest::gtest_main + benchmark::benchmark + Threads::Threads + cudf) +>>>>>>> working on row and column conversions target_include_directories( cudf_datagen @@ -46,6 +57,7 @@ target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen) # This function takes in a benchmark name and benchmark source and handles setting all of the # associated properties and linking to build the benchmark function(ConfigureBench CMAKE_BENCH_NAME) +<<<<<<< HEAD add_executable(${CMAKE_BENCH_NAME} ${ARGN}) set_target_properties( ${CMAKE_BENCH_NAME} PROPERTIES 
RUNTIME_OUTPUT_DIRECTORY @@ -71,6 +83,17 @@ endfunction() # ################################################################################################## # * column benchmarks ----------------------------------------------------------------------------- +======= + add_executable(${CMAKE_BENCH_NAME} ${ARGN}) + set_target_properties(${CMAKE_BENCH_NAME} + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$") + target_link_libraries(${CMAKE_BENCH_NAME} + PRIVATE cudf_benchmark_common cudf_datagen benchmark::benchmark_main) +endfunction() + +################################################################################################### +# - column benchmarks ----------------------------------------------------------------------------- +>>>>>>> working on row and column conversions ConfigureBench(COLUMN_CONCAT_BENCH column/concatenate_benchmark.cpp) # ################################################################################################## @@ -81,12 +104,17 @@ ConfigureBench(GATHER_BENCH copying/gather_benchmark.cu) # * scatter benchmark ----------------------------------------------------------------------------- ConfigureBench(SCATTER_BENCH copying/scatter_benchmark.cu) +<<<<<<< HEAD # ################################################################################################## # * lists scatter benchmark ----------------------------------------------------------------------- ConfigureBench(SCATTER_LISTS_BENCH lists/copying/scatter_lists_benchmark.cu) # ################################################################################################## # * contiguous_split benchmark ------------------------------------------------------------------- +======= +################################################################################################### +# - contiguous_split benchmark ------------------------------------------------------------------- +>>>>>>> working on row and column conversions ConfigureBench(CONTIGUOUS_SPLIT_BENCH copying/contiguous_split_benchmark.cu) # ################################################################################################## @@ -110,10 +138,16 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask_ben # * stream_compaction benchmark ------------------------------------------------------------------- ConfigureBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates_benchmark.cpp) +<<<<<<< HEAD # ################################################################################################## # * join benchmark -------------------------------------------------------------------------------- ConfigureBench(JOIN_BENCH join/join_benchmark.cu join/conditional_join_benchmark.cu) ConfigureNVBench(JOIN_NVBENCH join/join_nvbench.cu) +======= +################################################################################################### +# - join benchmark -------------------------------------------------------------------------------- +ConfigureBench(JOIN_BENCH join/join_benchmark.cu) +>>>>>>> working on row and column conversions # ################################################################################################## # * iterator benchmark ---------------------------------------------------------------------------- @@ -239,7 +273,6 @@ ConfigureBench( string/factory_benchmark.cu string/filter_benchmark.cpp string/find_benchmark.cpp - string/repeat_strings_benchmark.cpp string/replace_benchmark.cpp string/replace_re_benchmark.cpp string/split_benchmark.cpp @@ -248,6 +281,7 @@ ConfigureBench( 
string/url_decode_benchmark.cpp ) +<<<<<<< HEAD # ################################################################################################## # * json benchmark ------------------------------------------------------------------- ConfigureBench(JSON_BENCH string/json_benchmark.cpp) @@ -255,3 +289,8 @@ ConfigureBench(JSON_BENCH string/json_benchmark.cpp) # ################################################################################################## # * io benchmark --------------------------------------------------------------------- ConfigureBench(MULTIBYTE_SPLIT_BENCHMARK io/text/multibyte_split_benchmark.cpp) +======= +################################################################################################### +# - row conversion benchmark ---------------------------------------------------------------------------- +ConfigureBench(ROW_CONVERSION_BENCH row_conversion/row_conversion.cpp) +>>>>>>> working on row and column conversions diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index 7c1f52c5cd6..c4edee91b3c 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -25,7 +25,7 @@ class RowConversion : public cudf::benchmark { }; -static void BM_old_to_row(benchmark::State& state) +static void BM_to_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const table = create_random_table({cudf::type_id::INT8, @@ -37,44 +37,8 @@ static void BM_old_to_row(benchmark::State& state) cudf::type_id::UINT16, cudf::type_id::UINT8, cudf::type_id::UINT64}, - 212, + 50, row_count{n_rows}); - /* auto const table = create_random_table({cudf::type_id::INT32}, - 64, - row_count{n_rows});*/ - - cudf::size_type total_bytes = 0; - for (int i = 0; i < table->num_columns(); ++i) { - auto t = table->get_column(i).type(); - total_bytes += cudf::size_of(t); - } - - for (auto _ : state) { - cuda_event_timer raii(state, true, rmm::cuda_stream_default); - - auto rows = cudf::convert_to_rows(table->view()); - } - - state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -static void BM_new_to_row(benchmark::State& state) -{ - cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, - cudf::type_id::INT32, - cudf::type_id::INT16, - cudf::type_id::INT64, - cudf::type_id::INT32, - cudf::type_id::BOOL8, - cudf::type_id::UINT16, - cudf::type_id::UINT8, - cudf::type_id::UINT64}, - 212, - row_count{n_rows}); - /* auto const table = create_random_table({cudf::type_id::INT32}, - 64, - row_count{n_rows});*/ cudf::size_type total_bytes = 0; for (int i = 0; i < table->num_columns(); ++i) { @@ -85,13 +49,14 @@ static void BM_new_to_row(benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); +// auto rows = cudf::convert_to_rows(table->view()); auto new_rows = cudf::convert_to_rows2(table->view()); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); } -static void BM_old_from_row(benchmark::State& state) +static void BM_from_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const table = create_random_table({cudf::type_id::INT8, @@ -105,6 +70,9 @@ static void BM_old_from_row(benchmark::State& state) cudf::type_id::UINT64}, 256, row_count{n_rows}); + /* auto const table = 
create_random_table({cudf::type_id::INT32}, + 4, + row_count{n_rows});*/ std::vector schema; cudf::size_type total_bytes = 0; @@ -125,60 +93,24 @@ static void BM_old_from_row(benchmark::State& state) state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); } -static void BM_new_from_row(benchmark::State& state) -{ - cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, - cudf::type_id::INT32, - cudf::type_id::INT16, - cudf::type_id::INT64, - cudf::type_id::INT32, - cudf::type_id::BOOL8, - cudf::type_id::UINT16, - cudf::type_id::UINT8, - cudf::type_id::UINT64}, - 256, - row_count{n_rows}); - - std::vector schema; - cudf::size_type total_bytes = 0; - for (int i = 0; i < table->num_columns(); ++i) { - auto t = table->get_column(i).type(); - schema.push_back(t); - total_bytes += cudf::size_of(t); - } - - auto rows = cudf::convert_to_rows(table->view()); - - for (auto _ : state) { - cuda_event_timer raii(state, true, rmm::cuda_stream_default); - - auto out = cudf::convert_from_rows2(rows, schema); - } - - state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ +#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { BM_to_row(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 16, 1 << 24}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) -TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(to_row_conversion) -#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ +#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ + (::benchmark::State & st) { BM_from_row(st); } \ BENCHMARK_REGISTER_F(RowConversion, name) \ ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ + ->Ranges({{1 << 6, 1 << 22}}) \ ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -FROM_ROW_CONVERSION_BENCHMARK_DEFINE(old_from_row_conversion, BM_old_from_row) -FROM_ROW_CONVERSION_BENCHMARK_DEFINE(new_from_row_conversion, BM_new_from_row) +FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) diff --git a/cpp/include/cudf/row_conversion.hpp b/cpp/include/cudf/row_conversion.hpp index 282ffa4b0cb..f5e2225ad19 100644 --- a/cpp/include/cudf/row_conversion.hpp +++ b/cpp/include/cudf/row_conversion.hpp @@ -48,16 +48,4 @@ std::unique_ptr convert_from_rows( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); -std::unique_ptr convert_from_rows2( - cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr convert_from_rows2( - std::vector> const &input, - std::vector const &schema, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); - } // namespace cudf diff --git 
a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 0879a1c50a5..fb5dc4cb38d 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -14,14 +14,12 @@ * limitations under the License. */ -#include #include #include #include #include #include -#include #include #include #include @@ -31,15 +29,11 @@ #include #include -#include #include "cudf/types.hpp" #include "rmm/device_buffer.hpp" #include "thrust/iterator/counting_iterator.h" #include "thrust/iterator/transform_iterator.h" -#define NUM_BLOCKS_PER_KERNEL_TO_COLUMNS (2) - -using cudf::detail::make_device_uvector_async; namespace cudf { namespace detail { @@ -49,6 +43,34 @@ static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size return (offset + alignment - 1) & ~(alignment - 1); } + +/** + * Copy a simple vector to device memory asynchronously. Be sure to read + * the data on the same stream as is used to copy it. + */ +template +std::unique_ptr> copy_to_dev_async(const std::vector &input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + std::unique_ptr> ret(new rmm::device_uvector(input.size(), stream, mr)); + CUDA_TRY(cudaMemcpyAsync( + ret->data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); + return ret; +} + +template +rmm::device_uvector copy_to_dev_async2( + const std::vector &input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + rmm::device_uvector ret(input.size(), stream, mr); + CUDA_TRY(cudaMemcpyAsync( + ret.data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); + return ret; +} + __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type row_size, @@ -162,7 +184,7 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, cudf::size_type byte_bit_offset = col_index % 8; int predicate = *valid_byte & (1 << byte_bit_offset); uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { nm[row_index / 8] = bitmask; } + if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } } // end column loop } // end row copy // wait for the row_group to be totally copied before starting on the next row group @@ -311,20 +333,6 @@ struct block_info { int buffer_num; }; -// When building the columns to return, we have to be mindful of the offset limit in cudf. -// It is 32-bit and these data columns are capable of surpassing that easily. The data should -// not be cut off exactly at the limit though due to the validity buffers. The most efficient -// place to cut the validity is on a 32-row boundary, so as we calculate the row sizes -// we keep track of the cut points for the validity, which we call row batches. If the row -// is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we -// hit. Note that this boundary is for our book-keeping with column pointers and not anything that -// the kernel needs to worry about. We cut the output at convienient boundaries when assembling -// the outgoing data stream. 
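// A host-side sketch of the batching idea described above, simplified so that a
// batch is closed whenever the next row would push its byte count past what a
// 32-bit offset can address; as the comment above notes, the real code prefers to
// cut on 32-row boundaries as well, which is omitted here. Names are illustrative.
#include <cstdint>
#include <limits>
#include <vector>

struct row_batch_sketch {
  int64_t num_bytes;
  int32_t row_count;
};

inline std::vector<row_batch_sketch> batch_rows(std::vector<int32_t> const &row_sizes)
{
  std::vector<row_batch_sketch> batches;
  int64_t bytes_in_batch = 0;
  int32_t rows_in_batch  = 0;
  for (auto const size : row_sizes) {
    // close the current batch if this row would overflow a 32-bit offset
    if (bytes_in_batch + size > std::numeric_limits<int32_t>::max()) {
      batches.push_back({bytes_in_batch, rows_in_batch});
      bytes_in_batch = 0;
      rows_in_batch  = 0;
    }
    bytes_in_batch += size;
    ++rows_in_batch;
  }
  if (rows_in_batch > 0) { batches.push_back({bytes_in_batch, rows_in_batch}); }
  return batches;
}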
-struct row_batch { - size_type num_bytes; - size_type row_count; -}; - /** * @brief copy data from cudf columns into x format, which is row-based * @@ -337,16 +345,16 @@ struct row_batch { * @param block_infos information about the blocks of work * @param row_offsets offset to a specific row in the input data * @param output_data pointer to output data - * + * */ -__global__ void copy_from_columns(const size_type num_rows, - const size_type num_columns, +__global__ void copy_from_columns(const cudf::size_type num_rows, + const cudf::size_type num_columns, const int8_t **input_data, - const bitmask_type **input_nm, - const size_type *col_sizes, - const size_type *col_offsets, + const cudf::bitmask_type **input_nm, + const cudf::size_type *col_sizes, + const cudf::size_type *col_offsets, const block_info *block_infos, - const size_type *row_offsets, + const uint64_t *row_offsets, int8_t **output_data) { // We are going to copy the data in two passes. @@ -357,119 +365,46 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. - bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; - - if (debug_print) { - printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("Column Info:\n"); - for (int i = 0; i < num_columns; ++i) { - printf("col %d is at %p with size %d and offset %d\n", - i, - input_data[i], - col_sizes[i], - col_offsets[i]); - } - printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); - /* printf("Row Offsets:\n"); - for (int i=0; i(&output_data[0][output_start_offset]) & - 7; // offset for alignment shim in order to match shared memory with final dest - if (debug_print) { - printf("outputting to offset %lu\n", output_start_offset); - printf("dest shim offset is %d\n", dest_shim_offset); - printf("Shared data is %p-%p\n", shared_data, shared_data + (48 * 1024)); - printf("my block is %d,%d -> %d,%d - buffer %d\n", - block.start_col, - block.start_row, - block.end_col, - block.end_row, - block.buffer_num); - } + uint8_t const dest_shim_offset = reinterpret_cast(&output_data[0][output_start_offset]) & 7; // offset for alignment shim in order to match shared memory with final dest + + printf("copying from column %d to column %d with rows %d to row %d(grid dim %d, blockIdx %d)\n", block.start_col, block.end_col, block.start_row, block.end_row, gridDim.x, blockIdx.x); + // each thread is responsible for every threadcount rows of data. - // the data is copied into shared memory in the final layout. - auto const real_bytes_in_row = - col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col]; - auto const shmem_row_size = align_offset(real_bytes_in_row + dest_shim_offset, - 8); // 8 byte alignment required for shared memory rows + // the data is copies into shared memory in the final layout. 
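// A small sketch of the alignment shim used just below: the shared-memory copy of
// a row is padded at the front by the destination's misalignment (dest_shim_offset)
// so that the later 8-byte writes out of shared memory land on 8-byte-aligned
// destination addresses. padded_shmem_row_size is an illustrative name.
#include <cstdint>

constexpr int32_t padded_shmem_row_size(int32_t row_bytes, uint8_t dest_shim_offset)
{
  // shim bytes in front, then round the whole row up to an 8-byte multiple
  return (row_bytes + dest_shim_offset + 7) & ~7;
}

// a 13-byte row whose destination starts 3 bytes past an 8-byte boundary
// occupies 16 bytes of shared memory (3 bytes of shim + 13 bytes of data)
static_assert(padded_shmem_row_size(13, 3) == 16, "shim then 8-byte pad");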
+ auto const shmem_row_size = align_offset(col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col] + dest_shim_offset, 8); // 8 byte alignment required for shared memory rows auto const validity_offset = col_offsets[num_columns]; - if (debug_print) { - printf("col_offsets[%d] = %d, col_sizes[%d] = %d, col_offsets[%d] = %d\n", - block.end_col, - col_offsets[block.end_col], - block.end_col, - col_sizes[block.end_col], - block.start_col, - col_offsets[block.start_col]); - printf("shmem row size %d with real bytes %d\n", shmem_row_size, real_bytes_in_row); - printf("validity offset is %d\n", validity_offset); - printf("starting at %d,%d and going to %d, %d\n", - block.start_col, - block.start_row, - block.end_col, - block.end_row); - } - for (int col = block.start_col; col <= block.end_col; ++col) { - /*if (!col_is_variable) */ { - uint64_t col_offset = 0; + for (int col=block.start_col; col<=block.end_col; ++col) { + /*if (!col_is_variable) */{ + uint64_t col_offset = 0; cudf::size_type col_size = col_sizes[col]; - auto const dest_col_offset = - col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; - if (debug_print) { printf("dest col offset %d\n", dest_col_offset); } - for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { - if (debug_print) { - printf("shmem row %d(%d) at offset %d(%d)\n", - row - block.start_row, - row, - (row - block.start_row) * shmem_row_size, - row * shmem_row_size); - } - int8_t *shmem_dest = - &shared_data[dest_col_offset + shmem_row_size * (row - block.start_row)]; + auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; + for (int row=block.start_row + threadIdx.x; row(input_data[col]); - if (debug_print) { printf("%p <- short %d\n", shmem_dest, short_col_input[row]); } + const int16_t *short_col_input = reinterpret_cast(input_data[col]); *reinterpret_cast(shmem_dest) = short_col_input[row]; break; } case 4: { - const int32_t *int_col_input = reinterpret_cast(input_data[col]); - if (debug_print) { - printf("shmem[%d][%d] - %p <- int 0x%x\n", row, col, shmem_dest, int_col_input[row]); - } + const int32_t *int_col_input = reinterpret_cast(input_data[col]); *reinterpret_cast(shmem_dest) = int_col_input[row]; break; } case 8: { - const int64_t *long_col_input = reinterpret_cast(input_data[col]); - if (debug_print) { printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); } + const int64_t *long_col_input = reinterpret_cast(input_data[col]); *reinterpret_cast(shmem_dest) = long_col_input[row]; break; } default: { cudf::size_type input_offset = col_size * row; - if (debug_print) { - printf("byte for byte copy due to size %d of column %d\n", col_size, col); - printf("%p <- input_data[%d] which is %d\n", - shmem_dest, - input_offset, - input_data[col][input_offset]); - } // TODO this should just not be supported for fixed width columns, but just in case... 
for (cudf::size_type b = 0; b < col_size; b++) { shmem_dest[b] = input_data[col][b + input_offset]; @@ -482,13 +417,11 @@ __global__ void copy_from_columns(const size_type num_rows, // so we have to rewrite the addresses to make sure that it is 4 byte aligned // we do this directly in the final location because the entire row may not // fit in shared memory and may require many blocks to process it entirely - int8_t *valid_byte = - &output_data[block.buffer_num][row_offsets[row] + validity_offset + (col / 8)]; + int8_t *valid_byte = &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; cudf::size_type byte_bit_offset = col % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); - if (debug_print) { printf("Outputting validity to %p\n", valid_byte); } // Now copy validity for the column if (input_nm[col]) { if (bit_is_set(input_nm[col], row)) { @@ -500,11 +433,11 @@ __global__ void copy_from_columns(const size_type num_rows, // It is valid so just set the bit atomicOr_block(valid_int, 1 << int_bit_offset); } - } // end row + } // end row - col_offset += col_sizes[col] * rows_in_block; + col_offset += col_sizes[col] * (block.end_row - block.start_row); } - } // end col + } // end col // wait for the data to be totally copied into shared memory __syncthreads(); @@ -517,311 +450,35 @@ __global__ void copy_from_columns(const size_type num_rows, // row in shared memory may not be an entire row of the destination. // auto const thread_start_offset = threadIdx.x * 8; - auto const thread_stride = blockDim.x * 8; - auto const end_offset = shmem_row_size * rows_in_block; - - if (debug_print) { - printf("writing final data from %d to %d at stride %d\n", - thread_start_offset, - shmem_row_size * rows_in_block, - thread_stride); - printf("rows in block %d\n", rows_in_block); - } - for (auto src_offset = thread_start_offset; src_offset < end_offset; - src_offset += thread_stride) { + auto const thread_stride = gridDim.x * 8; + for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * (block.end_row - block.start_row); src_offset += thread_stride) { auto const output_row_num = src_offset / shmem_row_size; - auto const row_offset = row_offsets[block.start_row + output_row_num]; - auto const col_offset = src_offset % shmem_row_size; - int8_t *output_ptr = &output_data[block.buffer_num][row_offset + col_offset]; - int8_t *input_ptr = &shared_data[src_offset]; - - // three cases to worry about here - // 1) first 8-byte part of a large row - dest_shim_offset bytes of pad at the front - // 2) last 8-byte part of a large row - some bytes of pad at the end - // 3) corner case of <= 8 bytes of data, which means dest_shim_offset bytes of pad at the front - // AND potentially pad at the rear - - // we know the real number of bytes in a row, so we can figure out if we are in case 3 easily. - // 1st case is when we're at some even multiple of shmem_row_size offset. - // 2nd case is when offset + 8 is some even multiple of shmem_row_size. - // must be an 8 byte copy - - // there is a chance we have a 0 dest_shim_offset and an 8 byte thing to copy, optimize? 
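// A host-side sketch of the three cases enumerated above, assuming the same
// quantities used below (src_offset steps through shared memory 8 bytes at a
// time). classify_copy and copy_case are illustrative names only.
#include <cstdint>

enum class copy_case { tiny_row, leading_pad, trailing_pad, full_quad };

inline copy_case classify_copy(int32_t src_offset,
                               int32_t shmem_row_size,
                               int32_t real_bytes_in_row,
                               uint8_t dest_shim_offset)
{
  if (real_bytes_in_row + dest_shim_offset <= 8) return copy_case::tiny_row;  // case 3
  if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0)
    return copy_case::leading_pad;  // case 1: first quad of a row
  if ((src_offset + 8) % shmem_row_size == 0 && (real_bytes_in_row + dest_shim_offset) % 8 > 0)
    return copy_case::trailing_pad;  // case 2: last quad of a row
  return copy_case::full_quad;       // plain aligned 8-byte copy
}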
- if (real_bytes_in_row + dest_shim_offset <= 8) { - // case 3, we want to copy real_bytes_in_row bytes - auto const num_single_bytes = real_bytes_in_row - dest_shim_offset; - for (auto i = 0; i < num_single_bytes; ++i) { - if (debug_print) { - printf("case 3 - %d single byte final write %p(%d) -> %p\n", - num_single_bytes, - &input_ptr[i + dest_shim_offset], - input_ptr[i + dest_shim_offset], - &output_ptr[i]); - } - output_ptr[i] = input_ptr[i + dest_shim_offset]; - } - } else if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { - // first byte with leading pad + auto const row_offset = row_offsets[block.start_row + output_row_num]; + auto const col_offset = src_offset % shmem_row_size; + int8_t *output_ptr = &output_data[block.buffer_num][row_offset + col_offset]; + int8_t *input_ptr = &shared_data[src_offset]; + // the first part and last part of the row is unaligned data copy. This is copied a single byte + // at a time. + if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { + // first part of a row, copy single bytes auto const num_single_bytes = 8 - dest_shim_offset; - for (auto i = 0; i < num_single_bytes; ++i) { - if (debug_print) { - printf( - "single byte final write %p -> %p\n", &input_ptr[i + dest_shim_offset], &output_ptr[i]); - } + for (auto i=0; i 0) { - // last bytes of a row - auto const num_single_bytes = (real_bytes_in_row + dest_shim_offset) % 8; - for (auto i = 0; i < num_single_bytes; ++i) { - if (debug_print) { - printf("single trailing byte final write %p -> %p\n", - &input_ptr[i + dest_shim_offset], - &output_ptr[i]); - } + } else if (dest_shim_offset > 0 && (src_offset + 8) % shmem_row_size == 0) { + // last part of a row, copy single bytes + auto const num_single_bytes = dest_shim_offset; + for (auto i=0; i(input_ptr); - if (debug_print) { - printf( - "long final write %p -> %p\n", long_col_input, reinterpret_cast(output_ptr)); - } + const int64_t *long_col_input = reinterpret_cast(input_ptr); *reinterpret_cast(output_ptr) = *long_col_input; } } } -/** - * @brief copy data from row-based format to cudf columns - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets - * @param output_data - * @param output_nm - * @param col_sizes array of sizes for each element in a column - one per column - * @param col_offsets offset into input data row for each column's start - * @param block_infos information about the blocks of work - * @param input_data pointer to input data - * - */ -__global__ void copy_to_columns(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_block, - const size_type *offsets, - int8_t **output_data, - cudf::bitmask_type **output_nm, - const size_type *col_sizes, - const size_type *col_offsets, - const block_info *block_infos, - const int8_t *input_data) -{ - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // This has been broken up for us in the block_info struct, so we don't have - // any calculation to do here, but it is important to note. 
- - constexpr bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; - - if (debug_print) { - printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); - /* printf("Row Offsets:\n"); - for (int i=0; i blockDim.x) { - break; - } - auto block = block_infos[this_block_index]; - auto const rows_in_block = block.end_row - block.start_row + 1; - auto const cols_in_block = block.end_col - block.start_col + 1; - extern __shared__ int8_t shared_data[]; - - // copy data from our block's window to shared memory - // offsets information can get us on the row, then we need to know where the column - // starts to offset into the row data. - - // each thread is responsible for 8-byte chunks starting at threadIdx.x and striding - // at blockDim.x. If the 8-byte chunk falls on the boundary of the window, then the - // thread may copy less than 8 bytes. Even if at the beginning of the window, because - // every internal copy is aligned to 8-byte boundaries. - // - // thread 0 thread 1 thread 2 thread 3 thread 4 thread 5 - // 01234567 89abcdef 01234567 89abcdef 01234567 89abcdef - // xxxbbbbb bbbbbbbb bbbbbbbb bbbbbbbb bbbbbbbb bbxxxxxx - // | | | | | | | - // - // - - auto const window_start_quad = col_offsets[block.start_col] / 8; - auto const window_end_quad = (col_offsets[block.end_col] + col_sizes[block.end_col] + 7) / 8; - auto const window_quad_width = window_end_quad - window_start_quad; - auto const total_quads = window_quad_width * rows_in_block; - auto const shared_memory_starting_pad = col_offsets[block.start_col] & 0x7; - - if (debug_print) { - printf("col_offsets[%d]: %d, col_offsets[%d]: %d col_sizes[%d]: %d\n", block.start_col, col_offsets[block.start_col], block.end_col, col_offsets[block.end_col], block.end_col, col_sizes[block.end_col]); - printf("window start quad is %d, window end quad is %d\n", window_start_quad, window_end_quad); - printf("window quad width is %d and there are %d total quads\n%d shared memory starting pad\n", window_quad_width, total_quads, shared_memory_starting_pad); - } - - // the copy to shared memory will be greedy. We know that the data is 8-byte aligned, so we won't - // access illegal memory by doing 8-byte aligned copies, so we can copy 8-byte aligned. This will - // result in the window edges being duplicated across blocks, but we can copy the padding as well - // to speed up our transfers to shared memory. 
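// A host-side sketch of the quad indexing used by the loop below: each flat index
// names one aligned 8-byte chunk ("quad") of the window and decomposes into a row
// of the window plus a byte offset within that row. decompose_quad is illustrative.
#include <cstdint>

struct quad_pos_sketch {
  int32_t relative_row;   // row within the window
  int32_t offset_in_row;  // byte offset of this quad within the row
};

constexpr quad_pos_sketch decompose_quad(int32_t i, int32_t window_quad_width)
{
  return {i / window_quad_width, (i % window_quad_width) * 8};
}

// with a window three quads wide, flat index 7 is the second quad of the third row
static_assert(decompose_quad(7, 3).relative_row == 2, "row of the window");
static_assert(decompose_quad(7, 3).offset_in_row == 8, "byte offset in that row");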
- for (int i = threadIdx.x; i < total_quads; i += blockDim.x) { - auto const relative_row = i / window_quad_width; - auto const absolute_row = relative_row + block.start_row; - //auto const row = i / window_quad_width; - auto const offset_in_row = i % window_quad_width * 8; - auto const shmem_dest = &shared_data[i * 8]; - - if (debug_print) { - printf("relative_row: %d, absolute_row: %d, offset_in_row: %d, shmem_dest: %p\n", relative_row, absolute_row, offset_in_row, shmem_dest); - printf("offsets is %p\n", offsets); - printf("offsets[%d]: %d\n", absolute_row, offsets[absolute_row]); - printf("input_data[%d] will be dereferenced\n", offsets[absolute_row] + offset_in_row); - } - - // full 8-byte copy - const int64_t *long_col_input = - reinterpret_cast(&input_data[offsets[absolute_row] + offset_in_row]); - if (debug_print) { - printf("which will be address %p\n", long_col_input); - printf("%p <- long %lu\n", shmem_dest, *long_col_input); } - *reinterpret_cast(shmem_dest) = *long_col_input; - } - - __syncthreads(); - - // now we copy from shared memory to final destination. - // the data is laid out in rows in shared memory, so the reads - // for a column will be "vertical". Because of this and the different - // sizes for each column, this portion is handled on row/column basis. - // to prevent each thread working on a single row and also to ensure - // that all threads can do work in the case of more threads than rows, - // we do a global index instead of a double for loop with col/row. - for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { - auto const relative_col = index % cols_in_block; - auto const relative_row = index / cols_in_block; - auto const absolute_col = relative_col + block.start_col; - auto const absolute_row = relative_row + block.start_row; - - auto const shared_memory_row_offset = window_quad_width * 8 * relative_row; - auto const shared_memory_offset = col_offsets[absolute_col] - col_offsets[block.start_col] + - shared_memory_row_offset + shared_memory_starting_pad; - auto const column_size = col_sizes[absolute_col]; - - int8_t *shmem_src = &shared_data[shared_memory_offset]; - int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; - - if (debug_print) { - printf("relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, shared_mmeory_row_offset: %d, shared_memory_offset: %d," - " column_size: %d, shmem_src: %p, dst: %p\n", relative_col, relative_row, absolute_col, absolute_row, shared_memory_row_offset, shared_memory_offset, column_size, - shmem_src, dst) ; - } - switch (column_size) { - case 1: { - if (debug_print) { printf("%p <- byte %d\n", dst, *shmem_src); } - *dst = *shmem_src; - break; - } - case 2: { - const int16_t *short_col_input = reinterpret_cast(shmem_src); - if (debug_print) { printf("%p <- short %d\n", dst, *short_col_input); } - *reinterpret_cast(dst) = *short_col_input; - break; - } - case 4: { - const int32_t *int_col_input = reinterpret_cast(shmem_src); - if (debug_print) { printf("%p <- int 0x%x\n", dst, *int_col_input); } - *reinterpret_cast(dst) = *int_col_input; - break; - } - case 8: { - const int64_t *long_col_input = reinterpret_cast(shmem_src); - if (debug_print) { printf("%p <- long %lu\n", dst, *long_col_input); } - *reinterpret_cast(dst) = *long_col_input; - break; - } - default: { - if (debug_print) { - printf("byte for byte copy due to size %d of column %d\n", column_size, absolute_col); - } - // TODO this should just not be supported for fixed width columns, but just 
in case... - for (cudf::size_type b = 0; b < column_size; b++) { dst[b] = shmem_src[b]; } - break; - } - } - } - - // now handle validity. Each thread is responsible for 32 rows in 8 columns. - // to prevent indexing issues with a large number of threads, this is compressed - // to a single loop like above. TODO: investigate using shared memory here - auto const validity_batches_per_col = (num_rows + 31) / 32; - auto const validity_batches_total = std::max(1, validity_batches_per_col * (num_columns / 8)); - if (debug_print && threadIdx.x == 0 && blockIdx.x == 0) { - printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n%d blocks of %d threads\n", validity_batches_per_col, validity_batches_total, num_rows, gridDim.x, blockDim.x); - } - for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < validity_batches_total; index += blockDim.x * gridDim.x) { - auto const start_col = (index * 8) / validity_batches_per_col; - auto const batch = index % validity_batches_per_col; - auto const starting_row = batch * 32; - auto const validity_offset = col_offsets[num_columns] + (start_col / 8); - - if (debug_print) { - printf("%d-%d: cols: %d-%d, word index: %d, batch: %d, starting_row: %d, +validity_offset: %d, index: %d, stride: %d\n", threadIdx.x, blockIdx.x, start_col, start_col + 7, (start_col / 8), batch, starting_row, validity_offset, index, blockDim.x * gridDim.x); - } - - // one for each column - int32_t dst_validity[8] = {0}; - for (int row = starting_row; row < std::min(num_rows, starting_row + 32); ++row) { - int8_t const * const validity_ptr = &input_data[offsets[row] + validity_offset]; - - if (debug_print) { - printf("%d: validity_ptr is %p for row %d\n", threadIdx.x, validity_ptr, row); - } - - auto const val_byte = *validity_ptr; - - for (int i=0; i> src_shift); - } - // auto const dst_bit_mask = 1 << dst_shift; - dst_validity[i] |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); - } - } - - - for (int i=0; i(output_nm[start_col + i] + (starting_row / 32)); - if (debug_print) { - printf("%d-%d: validity write output_nm[%d][%d] - %p <- %d\n", threadIdx.x, blockIdx.x, start_col + i, starting_row, validity_ptr, dst_validity[i]); - } - *validity_ptr = dst_validity[i]; - } - } -} -} - /** * Calculate the dimensions of the kernel for fixed width only columns. * @param [in] num_columns the number of columns being copied. 
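// A generic illustration of the kind of sizing the helper documented above
// performs, not its actual implementation: pick how many rows of a row group fit
// in the shared-memory budget, then derive the grid from that. Every name here is
// an assumption made for the sketch only.
#include <algorithm>
#include <cstdint>

struct fixed_width_dims_sketch {
  int threads_per_block;
  int blocks;
  int shared_bytes;
};

inline fixed_width_dims_sketch size_fixed_width_kernel(int num_rows,
                                                       int size_per_row,
                                                       int shmem_limit_per_block)
{
  // cap a block at 256 rows or at whatever fits in shared memory, whichever is smaller
  int const rows_per_block = std::max(1, std::min(256, shmem_limit_per_block / size_per_row));
  int const blocks         = (num_rows + rows_per_block - 1) / rows_per_block;
  return {rows_per_block, blocks, rows_per_block * size_per_row};
}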
@@ -895,10 +552,10 @@ static std::unique_ptr fixed_width_convert_to_rows( const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type size_per_row, - rmm::device_uvector &column_start, - rmm::device_uvector &column_size, - rmm::device_uvector &input_data, - rmm::device_uvector &input_nm, + std::unique_ptr> &column_start, + std::unique_ptr> &column_size, + std::unique_ptr> &input_data, + std::unique_ptr> &input_nm, const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, rmm::cuda_stream_view stream, @@ -929,10 +586,10 @@ static std::unique_ptr fixed_width_convert_to_rows( num_rows, num_columns, size_per_row, - column_start.data(), - column_size.data(), - input_data.data(), - input_nm.data(), + column_start->data(), + column_size->data(), + input_data->data(), + input_nm->data(), data->mutable_view().data()); return cudf::make_lists_column(num_rows, @@ -986,165 +643,21 @@ static inline int32_t compute_fixed_width_layout(std::vector co return align_offset(at_offset, 8); // 8 bytes (64 bits) } -template -static size_type compute_column_information( - iterator begin, - iterator end, - std::vector &column_starts, - std::vector &column_sizes)//, - //std::function nested_type_cb) -{ - size_type fixed_width_size_per_row = 0; - for (auto cv = begin; cv != end; ++cv) { - auto col_type = std::get<0>(*cv); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - -// if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } - - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 8 : size_of(col_type); - - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - } - - auto validity_offset = detail::align_offset(fixed_width_size_per_row, 4); - column_starts.push_back(validity_offset); - - return fixed_width_size_per_row; -} +} // namespace detail //#define DEBUG - -static std::vector build_block_infos(std::vector const &column_sizes, - std::vector const &column_starts, - std::vector const &row_batches, - size_type const total_number_of_rows, - size_type const &shmem_limit_per_block) +std::vector> convert_to_rows2(cudf::table_view const &tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { - std::vector block_infos; - - // block infos are organized with the windows going "down" the columns - // this provides the most coalescing of memory access - int current_window_width = 0; - int current_window_start_col = 0; - - // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( - int const start_col, int const end_col, int const desired_window_height) { - int current_window_start_row = 0; - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; - while (i < total_number_of_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(desired_window_height, rows_left_in_batch); - - block_infos.emplace_back(detail::block_info{ - start_col, - current_window_start_row, - end_col, - 
std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), - current_window_row_batch}); - - i += window_height; - current_window_start_row += window_height; - rows_left_in_batch -= window_height; - } - }; - - // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write - // would be memory cache line sized access, but since other blocks will read/write the edges this - // may not turn out to be overly important. For now, we will attempt to build a square window as - // far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we - // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in - // bytes, not rows or columns. - int const window_height = std::min( - std::min(size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], total_number_of_rows), - row_batches[0].row_count); -#if defined(DEBUG) - printf( - "sqrt(shmem_limit_per_block) / column_sizes[0] is %d and num_rows is %d, batch row count is %d - which makes window height " - "%d\n", - size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], - total_number_of_rows, - row_batches[0].row_count, - window_height); -#endif - - int row_size = 0; - - // march each column and build the blocks of appropriate sizes - for (unsigned int col = 0; col < column_sizes.size(); ++col) { - auto const col_size = column_sizes[col]; - - // align size for this type - std::size_t alignment_needed = col_size; // They are the same for fixed width types - auto row_size_aligned = detail::align_offset(row_size, alignment_needed); - auto row_size_with_this_col = row_size_aligned + col_size; - auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - - if (row_size_with_end_pad * window_height > shmem_limit_per_block) { -#if defined(DEBUG) - printf( - "Window size %d too large at column %d, bumping back to build windows of size %d(cols " - "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " - "for shared mem size %d\n", - row_size_with_end_pad * window_height, - col, - row_size * window_height, - current_window_start_col, - col - 1, - window_height, - row_size_with_end_pad, - row_size, - row_size_aligned, - shmem_limit_per_block); -#endif - // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col - 1, window_height); - row_size = - detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); -#if defined(DEBUG) - printf( - "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " - "or %d)\n", - row_size, - col_size, - row_size + col_size, - column_starts[col - 1], - column_sizes[col - 1], - column_starts[col - 1] + column_sizes[col - 1]); -#endif - row_size += col_size; // alignment required for shared memory window boundary to match - // alignment of output row - current_window_start_col = col; - current_window_width = 0; - } else { - row_size = row_size_with_this_col; - current_window_width++; - } - } - - // build last set of blocks - if (current_window_width > 0) { - build_blocks(current_window_start_col, (int)column_sizes.size()-1, window_height); - } - - return block_infos; -} -} // namespace detail + // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the data, but small enough + // that multiple columns fit in memory so the writes can coalese as well. Potential optimization for window sizes. 
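// A minimal standalone sketch (host-side C++, illustrative only) of the "square window"
// heuristic described above: pick a window height of roughly sqrt(shared-memory budget)
// bytes, expressed in rows of the first column, then clamp it to the table's row count and
// the first row batch. The names shmem_limit_per_block, column_sizes and row_batches mirror
// the surrounding code; row_batch_stub and pick_window_height are hypothetical helpers
// added only for illustration, not part of this patch.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

namespace sketch {
using size_type = int32_t;

struct row_batch_stub {
  size_type num_bytes;
  size_type row_count;
};

inline size_type pick_window_height(size_type shmem_limit_per_block,
                                    std::vector<size_type> const &column_sizes,
                                    size_type total_number_of_rows,
                                    std::vector<row_batch_stub> const &row_batches)
{
  // aim for height * width ~= shmem_limit_per_block with height ~= width in bytes,
  // so height is about sqrt(budget) bytes, converted to rows of the first column's size
  auto const bytes_tall =
    static_cast<size_type>(std::sqrt(static_cast<double>(shmem_limit_per_block)));
  auto const rows_tall = std::max<size_type>(1, bytes_tall / std::max<size_type>(1, column_sizes[0]));
  // never taller than the table itself or the first row batch
  return std::min({rows_tall, total_number_of_rows, row_batches[0].row_count});
}
}  // namespace sketch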
+ constexpr int max_window_height = 1024; + const size_type num_columns = tbl.num_columns(); + const size_type num_rows = tbl.num_rows(); -#if defined(DEBUG) - void pretty_print(uint64_t i) { + #if defined(DEBUG) + auto pretty_print = [](uint64_t i) { if (i > (1 * 1024 * 1024 * 1024)) { printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); } else if (i > (1 * 1024 * 1024)) { @@ -1154,18 +667,8 @@ static std::vector build_block_infos(std::vector const &c } else { printf("%lu Bytes", i); } - } -#endif - -std::vector> convert_to_rows2(cudf::table_view const &tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the - // data, but small enough that multiple columns fit in memory so the writes can coalese as well. - // Potential optimization for window sizes. - const size_type num_columns = tbl.num_columns(); - const size_type num_rows = tbl.num_rows(); + }; + #endif int device_id; CUDA_TRY(cudaGetDevice(&device_id)); @@ -1173,12 +676,6 @@ std::vector> convert_to_rows2(cudf::table_view con CUDA_TRY( cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); -#if defined(DEBUG) - size_t free, total; - cudaMemGetInfo(&free, &total); - printf("%lu/%lu Memory\n", free, total); -#endif - // break up the work into blocks, which are a starting and ending row/col #. // this window size is calculated based on the shared memory size available // we want a single block to fill up the entire shared memory space available @@ -1194,78 +691,50 @@ std::vector> convert_to_rows2(cudf::table_view con // to that point. These are row batches and they are decided first before building the // windows so the windows can be properly cut around them. 
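// A minimal sketch of the 8-byte alignment primitive this routine leans on everywhere
// (row starts, shared-memory rows, validity words). align_offset here mirrors
// detail::align_offset defined earlier in this file; align_offset_examples is a hypothetical
// helper added only to show the rounding behaviour, assuming power-of-two alignments.
#include <cassert>
#include <cstdint>

namespace sketch {
constexpr int32_t align_offset(int32_t offset, int32_t alignment)
{
  // rounds offset up to the next multiple of alignment (alignment must be a power of two)
  return (offset + alignment - 1) & ~(alignment - 1);
}

inline void align_offset_examples()
{
  assert(align_offset(0, 8) == 0);    // already on an 8-byte boundary
  assert(align_offset(1, 8) == 8);    // rounds up to the next 8-byte boundary
  assert(align_offset(13, 4) == 16);  // any power-of-two alignment works the same way
}
}  // namespace sketch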
- // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; - input_data.reserve(num_columns); - input_nm.reserve(num_columns); - for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); - auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (!nested_type) { - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - } - - auto dev_input_data = make_device_uvector_async(input_data, stream, mr); - auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); - - std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row + std::vector row_sizes; // size of each row in bytes including any alignment padding + std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column - std::vector column_starts; // offset of column inside a row including alignment - std::vector - variable_width_columns; // list of the variable width columns in the table + std::vector column_starts; // offset of column inside a row including alignment + std::vector variable_width_columns; // list of the variable width columns in the table row_sizes.reserve(num_rows); row_offsets.reserve(num_rows); column_sizes.reserve(num_columns); - column_starts.reserve(num_columns + 1); // we add a final offset for validity data start - - auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&tbl](auto i) -> std::tuple { - return std::make_tuple(tbl.column(i).type(), tbl.column(i)); - }); - - size_type fixed_width_size_per_row = detail::compute_column_information( - iter, - iter + num_columns, - column_starts, - column_sizes);//, -// [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); - /* size_type fixed_width_size_per_row = 0; - for (int col = 0; col < num_columns; ++col) { - auto cv = tbl.column(col); - auto col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (nested_type) { variable_width_columns.push_back(cv); } - - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 
8 : size_of(col_type); - - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - }*/ - -#if defined(DEBUG) - printf("validity offset will be %d + %d = %d\n", - column_starts.back(), - column_sizes.back(), - column_starts.back() + column_sizes.back()); -#endif - - - auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); - auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); - - std::vector row_batches; + column_starts.reserve(num_columns+1); // we add a final offset for validity data start + + size_type fixed_width_size_per_row = 0; + for (int col = 0; col < num_columns; ++col) { + auto cv = tbl.column(col); + auto col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (nested_type) { variable_width_columns.push_back(cv);} + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + } + + // When building the columns to return, we have to be mindful of the offset limit in cudf. + // It is 32-bit and these data columns are capable of surpassing that easily. The data should + // not be cut off exactly at the limit though due to the validity buffers. The most efficient + // place to cut the validity is on a 32-row boundary, so as we calculate the row sizes + // we keep track of the cut points for the validity, which we call row batches. If the row + // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we hit. + // Note that this boundary is for our book-keeping with column pointers and not anything + // that the kernel needs to worry about. We cut the output at convienient boundaries + // when assembling the outgoing data stream. + struct row_batch { + size_type num_bytes; + size_type row_count; + }; + std::vector row_batches; auto calculate_variable_width_row_data_size = [](int const row) { // each level of variable-width data will add an offset/length @@ -1277,156 +746,210 @@ std::vector> convert_to_rows2(cudf::table_view con // will be included in the variable-width data blob at the end of the // row. 
return 0; - /* auto c = variable_width_columns[col]; - while (true) { - auto col_offsets = c.child(0).data(); - auto col_data_size = size_of(c.child(1).type()); - std::size_t alignment_needed = col_data_size; - - row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; - if (c.num_children() == 0) { - break; - } - c = c.child(1); - } - */ +/* auto c = variable_width_columns[col]; + while (true) { + auto col_offsets = c.child(0).data(); + auto col_data_size = size_of(c.child(1).type()); + std::size_t alignment_needed = col_data_size; + + row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; + if (c.num_children() == 0) { + break; + } + c = c.child(1); + } +*/ }; uint64_t row_batch_size = 0; uint64_t total_table_size = 0; - size_type row_batch_rows = 0; - uint64_t row_offset = 0; + size_type row_batch_rows = 0; + uint64_t row_offset = 0; - // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then - // calculate the size of each row's variable-width data and validity as well. - auto validity_size = num_bitmask_words(num_columns) * 4; + // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then calculate + // the size of each row's variable-width data as well. for (int row = 0; row < num_rows; ++row) { - auto aligned_row_batch_size = - detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned - row_sizes[row] = fixed_width_size_per_row; - // validity is byte aligned - row_sizes[row] += validity_size; - // variable width data is 8-byte aligned - row_sizes[row] = detail::align_offset(row_sizes[row], 8) + - calculate_variable_width_row_data_size(row); // rows are 8 byte aligned - - if ((uint64_t)aligned_row_batch_size + row_sizes[row] > - (uint64_t)std::numeric_limits::max()) { + row_sizes[row] = fixed_width_size_per_row + calculate_variable_width_row_data_size(row); + if (row_batch_size + row_sizes[row] > std::numeric_limits::max()) { // a new batch starts at the last 32-row boundary - row_batches.push_back( - detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); - row_batch_size = 0; - row_batch_rows = row_batch_rows & 31; - row_offset = 0; - aligned_row_batch_size = 0; + row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batch_size = 0; + row_batch_rows = row_batch_rows & 31; + row_offset = 0; } - row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned + row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned row_offsets.push_back(row_offset); - row_batch_size = aligned_row_batch_size + row_sizes[row]; + row_batch_size += row_sizes[row]; row_offset += row_sizes[row]; - total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned + total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned total_table_size += row_sizes[row]; row_batch_rows++; } if (row_batch_size > 0) { - row_batches.push_back(detail::row_batch{static_cast(row_batch_size), row_batch_rows}); + row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); } - auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); - -#if defined(DEBUG) - printf("%d rows and %d columns in table\n", num_rows, num_columns); + #if defined(DEBUG) printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { printf("%d: %d rows, ", i, row_batches[i].row_count); pretty_print(row_batches[i].num_bytes); 
printf("\n"); } -#endif + #endif - std::vector output_buffers; - std::vector output_data; - output_data.reserve(row_batches.size()); - for (uint i = 0; i < row_batches.size(); ++i) { - rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); - output_data.push_back(static_cast(temp.data())); - output_buffers.push_back(std::move(temp)); + std::vector block_infos; + + // block infos are organized with the windows going "down" the columns + // this provides the most coalescing of memory access + int current_window_size = 0; + int current_window_start_col = 0; + + // build the blocks for a specific set of columns + auto build_blocks = [&block_infos, &row_batches, num_rows](int const start_col, int const end_col, int const desired_window_height) { + int current_window_start_row = 0; + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; + while (i < num_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(desired_window_height, rows_left_in_batch); + + block_infos.emplace_back( + detail::block_info{start_col, + current_window_start_row, + start_col + end_col, + std::min(current_window_start_row + window_height - 1, num_rows), current_window_row_batch}); + + i += window_height; + current_window_start_row += window_height; + rows_left_in_batch -= window_height; + } + }; + + int const window_height = std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); + + int row_size = 0; + + // march each column and build the blocks of appropriate sizes + for (int col = 0; col < num_columns; ++col) { + auto const col_size = column_sizes[col]; + + // align size for this type + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_with_this_col = detail::align_offset(row_size, alignment_needed) + col_size; + + if (row_size_with_this_col * window_height > shmem_limit_per_block) { + // too large, close this window, generate vertical blocks and restart + build_blocks(current_window_start_col, col - 1, window_height); + row_size = detail::align_offset(column_starts[col] & 7, alignment_needed) + col_size; // alignment required for shared memory window boundary to match alignment of output row + current_window_start_col = col; + } else { + row_size = row_size_with_this_col; + } } - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); + auto validity_offset = detail::align_offset(column_starts.back(), 4); + column_starts.push_back(validity_offset); + + // build last set of blocks + if (current_window_size > 0) { build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); } + + // Get the pointers to the input columnar data ready - possibly moved up to copy to gpu while calculating other things + std::vector input_data; + std::vector input_nm; + for (size_type column_number = 0; column_number < num_columns; column_number++) { + column_view cv = tbl.column(column_number); + auto const col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (!nested_type) { + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + } -#if defined(DEBUG) - printf("%lu windows for %d columns, %d rows to fit in ", - 
block_infos.size(), - block_infos[0].end_col - block_infos[0].start_col + 1, - block_infos[0].end_row - block_infos[0].start_row); + #if defined(DEBUG) + printf("%lu windows for %d columns, %d rows to fit in ", block_infos.size(), block_infos[0].end_col - block_infos[0].start_col, block_infos[0].end_row - block_infos[0].start_row); pretty_print(shmem_limit_per_block); printf(" shared mem("); pretty_print(fixed_width_size_per_row); printf("/row, %d columns, %d rows, ", num_columns, num_rows); pretty_print(total_table_size); printf(" total):\n"); -#endif + #endif - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); + auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + + std::vector output_data; + output_data.reserve(row_batches.size()); + for (uint i=0; i>>( - num_rows, - num_columns, - dev_input_data.data(), - dev_input_nm.data(), - dev_col_sizes.data(), - dev_col_starts.data(), - dev_block_infos.data(), - dev_row_offsets.data(), - reinterpret_cast(dev_output_data.data())); + dim3 blocks; + dim3 threads; + blocks.x = block_infos.size(); + blocks.y = 0; + blocks.z = 0; + threads.x = 1024; + threads.y = 0; + threads.z = 0; + detail::copy_from_columns<<>>(num_rows, + num_columns, + dev_input_data.data(), + dev_input_nm.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + dev_block_infos.data(), + dev_row_offsets.data(), + reinterpret_cast(dev_output_data.data())); // split up the output buffer into multiple buffers based on row batch sizes // and create list of byte columns int offset_offset = 0; std::vector> ret; - for (uint i = 0; i < row_batches.size(); ++i) { + for (uint i=0; i offset_vals; offset_vals.reserve(row_batches[i].row_count + 1); size_type cur_offset = 0; offset_vals.push_back(cur_offset); - for (int row = 0; row < row_batches[i].row_count; ++row) { - cur_offset = detail::align_offset(cur_offset, 8) + row_sizes[row + offset_offset]; + for (int row=0; row( - data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); + auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); + auto offsets = + std::make_unique(data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); - auto data = std::make_unique( - data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, std::move(output_buffers[i])); + auto data = + std::make_unique(data_type{cudf::type_id::INT8}, + row_batches[i].num_bytes, + std::move(output_data[i])); ret.push_back(cudf::make_lists_column(row_batches[i].row_count, - std::move(offsets), - std::move(data), - 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, - stream, - mr)); + std::move(offsets), + std::move(data), + 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, + stream, + mr)); } - + return ret; } @@ -1445,8 +968,8 @@ std::vector> convert_to_rows(cudf::table_view cons std::vector column_size; int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); - auto dev_column_start = make_device_uvector_async(column_start, stream, mr); - auto dev_column_size = make_device_uvector_async(column_size, stream, mr); + auto dev_column_start = 
detail::copy_to_dev_async(column_start, stream, mr); + auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; // Make the number of rows per batch a multiple of 32 so we don't have to worry about @@ -1463,16 +986,16 @@ std::vector> convert_to_rows(cudf::table_view cons input_data.emplace_back(cv.data()); input_nm.emplace_back(cv.null_mask()); } - auto dev_input_data = make_device_uvector_async(input_data, stream, mr); - auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); + auto dev_input_data = detail::copy_to_dev_async(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async(input_nm, stream, mr); using ScalarType = cudf::scalar_type_t; auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - zero->set_valid_async(true, stream); + zero->set_valid(true, stream); static_cast(zero.get())->set_value(0, stream); auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - step->set_valid_async(true, stream); + step->set_valid(true, stream); static_cast(step.get()) ->set_value(static_cast(size_per_row), stream); @@ -1500,100 +1023,6 @@ std::vector> convert_to_rows(cudf::table_view cons } } -std::unique_ptr convert_from_rows2(cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - // verify that the types are what we expect - cudf::column_view child = input.child(); - cudf::type_id list_type = child.type().id(); - CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, - "Only a list of bytes is supported as input"); - - cudf::size_type num_columns = schema.size(); - cudf::size_type num_rows = input.parent().size(); - - int device_id; - CUDA_TRY(cudaGetDevice(&device_id)); - int shmem_limit_per_block; - CUDA_TRY( - cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - - shmem_limit_per_block /= NUM_BLOCKS_PER_KERNEL_TO_COLUMNS; - - std::vector column_starts; - std::vector column_sizes; - - auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { - return std::make_tuple(schema[i], nullptr); - }); - size_type fixed_width_size_per_row = detail::compute_column_information( - iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); - - size_type validity_size = num_bitmask_words(num_columns) * 4; - - size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); - - // Ideally we would check that the offsets are all the same, etc. 
but for now - // this is probably fine - CUDF_EXPECTS(row_size * num_rows == child.size(), - "The layout of the data appears to be off"); - auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); - auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); - - // build the row_batches from the passed in list column - std::vector row_batches; - - row_batches.push_back(detail::row_batch{child.size(), num_rows}); - - // Allocate the columns we are going to write into - std::vector> output_columns; - std::vector output_data; - std::vector output_nm; - for (cudf::size_type i = 0; i < num_columns; i++) { - auto column = cudf::make_fixed_width_column( - schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); - auto mut = column->mutable_view(); - output_data.emplace_back(mut.data()); - output_nm.emplace_back(mut.null_mask()); - output_columns.emplace_back(std::move(column)); - } - - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); - - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - - dim3 blocks((block_infos.size() + (NUM_BLOCKS_PER_KERNEL_TO_COLUMNS - 1)) / NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); - #if defined(DEBUG) || 1 - dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)child.size())); - #else - dim3 threads(std::min(std::min(1024, shmem_limit_per_block / 8), (int)child.size())); - #endif -#if defined(DEBUG) - printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); - pretty_print(shmem_limit_per_block); - printf(" shared memory\n"); -#endif - detail::copy_to_columns<<>>( - num_rows, - num_columns, - shmem_limit_per_block, - input.offsets().data(), - dev_output_data.data(), - dev_output_nm.data(), - dev_col_sizes.data(), - dev_col_starts.data(), - dev_block_infos.data(), - child.data()); - - return std::make_unique(std::move(output_columns)); -} - std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, std::vector const &schema, rmm::cuda_stream_view stream, @@ -1618,8 +1047,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in // this is probably fine CUDF_EXPECTS(size_per_row * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_column_start = make_device_uvector_async(column_start, stream); - auto dev_column_size = make_device_uvector_async(column_size, stream); + auto dev_column_start = detail::copy_to_dev_async(column_start, stream, mr); + auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); // Allocate the columns we are going to write into std::vector> output_columns; @@ -1634,8 +1063,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in output_columns.emplace_back(std::move(column)); } - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); + auto dev_output_data = detail::copy_to_dev_async(output_data, stream, mr); + auto dev_output_nm = detail::copy_to_dev_async(output_nm, stream, mr); dim3 blocks; dim3 threads; @@ -1646,10 +1075,10 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in num_rows, num_columns, size_per_row, - dev_column_start.data(), - dev_column_size.data(), - dev_output_data.data(), - dev_output_nm.data(), + 
dev_column_start->data(), + dev_column_size->data(), + dev_output_data->data(), + dev_output_nm->data(), child.data()); return std::make_unique(std::move(output_columns)); @@ -1674,20 +1103,4 @@ std::unique_ptr convert_from_rows( // } } -std::unique_ptr convert_from_rows2( - std::vector> const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - CUDF_EXPECTS(input.size() == 1, "Too large of an input, need to concat the output tables..."); - - // for (uint i=0; iview(); - auto ret = convert_from_rows2(lcv, schema, stream, mr); - - return ret; - // } -} - } // namespace cudf From 7bcf41c94a30404b7145b1e32ea9ef77642ae787 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 10 Jun 2021 17:53:09 +0000 Subject: [PATCH 43/80] fixing kernel launch and updating --- .../row_conversion/row_conversion.cpp | 9 +- cpp/src/row_conversion/row_conversion.cu | 105 +++++++++++++----- 2 files changed, 83 insertions(+), 31 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index c4edee91b3c..9fa05c408e5 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -28,7 +28,7 @@ class RowConversion : public cudf::benchmark { static void BM_to_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, +/* auto const table = create_random_table({cudf::type_id::INT8, cudf::type_id::INT32, cudf::type_id::INT16, cudf::type_id::INT64, @@ -38,7 +38,10 @@ static void BM_to_row(benchmark::State& state) cudf::type_id::UINT8, cudf::type_id::UINT64}, 50, - row_count{n_rows}); + row_count{n_rows});*/ + auto const table = create_random_table({cudf::type_id::INT32}, + 64, + row_count{n_rows}); cudf::size_type total_bytes = 0; for (int i = 0; i < table->num_columns(); ++i) { @@ -98,7 +101,7 @@ static void BM_from_row(benchmark::State& state) (::benchmark::State & st) { BM_to_row(st); } \ BENCHMARK_REGISTER_F(RowConversion, name) \ ->RangeMultiplier(8) \ - ->Ranges({{1 << 16, 1 << 24}}) \ + ->Ranges({{1 << 6, 1 << 20}}) \ ->UseManualTime() \ ->Unit(benchmark::kMillisecond); diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index fb5dc4cb38d..994233a0700 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include #include @@ -347,14 +348,14 @@ struct block_info { * @param output_data pointer to output data * */ -__global__ void copy_from_columns(const cudf::size_type num_rows, - const cudf::size_type num_columns, +__global__ void copy_from_columns(const size_type num_rows, + const size_type num_columns, const int8_t **input_data, - const cudf::bitmask_type **input_nm, - const cudf::size_type *col_sizes, - const cudf::size_type *col_offsets, + const bitmask_type **input_nm, + const size_type *col_sizes, + const size_type *col_offsets, const block_info *block_infos, - const uint64_t *row_offsets, + const size_type *row_offsets, int8_t **output_data) { // We are going to copy the data in two passes. @@ -365,47 +366,92 @@ __global__ void copy_from_columns(const cudf::size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. 
+ bool debug_print = false; + + if (debug_print) { + printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); + printf("Column Info:\n"); + for (int i=0; i(&output_data[0][output_start_offset]) & 7; // offset for alignment shim in order to match shared memory with final dest - - printf("copying from column %d to column %d with rows %d to row %d(grid dim %d, blockIdx %d)\n", block.start_col, block.end_col, block.start_row, block.end_row, gridDim.x, blockIdx.x); - + if (debug_print) { + printf("outputting to offset %lu\n", output_start_offset); + printf("dest shim offset is %d\n", dest_shim_offset); + printf("Shared data is %p-%p\n", shared_data, shared_data + (48 * 1024)); + } // each thread is responsible for every threadcount rows of data. // the data is copies into shared memory in the final layout. auto const shmem_row_size = align_offset(col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col] + dest_shim_offset, 8); // 8 byte alignment required for shared memory rows auto const validity_offset = col_offsets[num_columns]; + if (debug_print) { + printf("col_offsets[%d] = %d, col_sizes[%d] = %d, col_offsets[%d] = %d\n", block.end_col, col_offsets[block.end_col], block.end_col, col_sizes[block.end_col], block.start_col, col_offsets[block.start_col]); + printf("shmem row size %d\n", shmem_row_size); + printf("validity offset is %d\n", validity_offset); + printf("starting at %d,%d and going to %d, %d\n", block.start_col, block.start_row, block.end_col, block.end_row); + } for (int col=block.start_col; col<=block.end_col; ++col) { /*if (!col_is_variable) */{ uint64_t col_offset = 0; cudf::size_type col_size = col_sizes[col]; auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; + if (debug_print) { + printf("dest col offset %d\n", dest_col_offset); + } for (int row=block.start_row + threadIdx.x; row(input_data[col]); + if (debug_print) { + printf("%p <- short %d\n", shmem_dest, short_col_input[row]); + } *reinterpret_cast(shmem_dest) = short_col_input[row]; break; } case 4: { const int32_t *int_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { + printf("shmem[%d][%d] - %p <- int %d\n", row, col, shmem_dest, int_col_input[row]); + } *reinterpret_cast(shmem_dest) = int_col_input[row]; break; } case 8: { const int64_t *long_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { + printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); + } *reinterpret_cast(shmem_dest) = long_col_input[row]; break; } default: { cudf::size_type input_offset = col_size * row; - // TODO this should just not be supported for fixed width columns, but just in case... + if (debug_print) { + printf("byte for byte copy due to size %d\n", col_size); + printf("%p <- input_data[%d] which is %d\n", shmem_dest, input_offset, input_data[col][input_offset]); + } + // TODO this should just not be supported for fixed width columns, but just in case... for (cudf::size_type b = 0; b < col_size; b++) { shmem_dest[b] = input_data[col][b + input_offset]; } @@ -676,6 +722,12 @@ std::vector> convert_to_rows2(cudf::table_view con CUDA_TRY( cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + #if defined(DEBUG) + size_t free, total; + cudaMemGetInfo( &free, &total ); + printf("%lu/%lu Memory", free, total); + #endif + // break up the work into blocks, which are a starting and ending row/col #. 
// this window size is calculated based on the shared memory size available // we want a single block to fill up the entire shared memory space available @@ -692,7 +744,7 @@ std::vector> convert_to_rows2(cudf::table_view con // windows so the windows can be properly cut around them. std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row + std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column std::vector column_starts; // offset of column inside a row including alignment std::vector variable_width_columns; // list of the variable width columns in the table @@ -821,7 +873,7 @@ std::vector> convert_to_rows2(cudf::table_view con block_infos.emplace_back( detail::block_info{start_col, current_window_start_row, - start_col + end_col, + end_col, std::min(current_window_start_row + window_height - 1, num_rows), current_window_row_batch}); i += window_height; @@ -889,23 +941,20 @@ std::vector> convert_to_rows2(cudf::table_view con auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); - std::vector output_data; + std::vector output_buffers; + std::vector output_data; output_data.reserve(row_batches.size()); for (uint i=0; i(temp.data())); + output_buffers.push_back(std::move(temp)); } - auto dev_output_data = detail::copy_to_dev_async2(row_offsets, stream, mr); + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); // blast through the entire table and convert it - dim3 blocks; - dim3 threads; - blocks.x = block_infos.size(); - blocks.y = 0; - blocks.z = 0; - threads.x = 1024; - threads.y = 0; - threads.z = 0; - detail::copy_from_columns<<>>(num_rows, + dim3 blocks(block_infos.size()); + dim3 threads(1024); + copy_from_columns<<>>(num_rows, num_columns, dev_input_data.data(), dev_input_nm.data(), @@ -932,14 +981,14 @@ std::vector> convert_to_rows2(cudf::table_view con } offset_offset += row_batches[i].row_count; - auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); + auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); auto offsets = std::make_unique(data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); auto data = std::make_unique(data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, - std::move(output_data[i])); + std::move(output_buffers[i])); ret.push_back(cudf::make_lists_column(row_batches[i].row_count, std::move(offsets), From 17f1e5da99044036a2873e98905a10b7a5725adb Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 16 Jun 2021 19:25:57 +0000 Subject: [PATCH 44/80] Updates and bug fixing --- .../row_conversion/row_conversion.cpp | 76 ++- cpp/src/row_conversion/row_conversion.cu | 498 ++++++++++++------ cpp/tests/row_conversion/row_conversion.cpp | 106 ---- 3 files changed, 378 insertions(+), 302 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index 9fa05c408e5..e1228c9df21 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -25,10 +25,43 @@ class RowConversion : public cudf::benchmark { }; -static void BM_to_row(benchmark::State& state) +static void BM_old_to_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; -/* auto const table = 
create_random_table({cudf::type_id::INT8, + auto const table = create_random_table({cudf::type_id::INT8, + cudf::type_id::INT32, + cudf::type_id::INT16, + cudf::type_id::INT64, + cudf::type_id::INT32, + cudf::type_id::BOOL8, + cudf::type_id::UINT16, + cudf::type_id::UINT8, + cudf::type_id::UINT64}, + 212, + row_count{n_rows}); + /* auto const table = create_random_table({cudf::type_id::INT32}, + 64, + row_count{n_rows});*/ + + cudf::size_type total_bytes = 0; + for (int i = 0; i < table->num_columns(); ++i) { + auto t = table->get_column(i).type(); + total_bytes += cudf::size_of(t); + } + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + + auto rows = cudf::convert_to_rows(table->view()); + } + + state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); +} + +static void BM_new_to_row(benchmark::State& state) +{ + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + auto const table = create_random_table({cudf::type_id::INT8, cudf::type_id::INT32, cudf::type_id::INT16, cudf::type_id::INT64, @@ -37,11 +70,11 @@ static void BM_to_row(benchmark::State& state) cudf::type_id::UINT16, cudf::type_id::UINT8, cudf::type_id::UINT64}, - 50, - row_count{n_rows});*/ - auto const table = create_random_table({cudf::type_id::INT32}, - 64, - row_count{n_rows}); + 212, + row_count{n_rows}); + /* auto const table = create_random_table({cudf::type_id::INT32}, + 64, + row_count{n_rows});*/ cudf::size_type total_bytes = 0; for (int i = 0; i < table->num_columns(); ++i) { @@ -52,14 +85,13 @@ static void BM_to_row(benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); -// auto rows = cudf::convert_to_rows(table->view()); auto new_rows = cudf::convert_to_rows2(table->view()); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); } -static void BM_from_row(benchmark::State& state) +/*static void BM_from_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const table = create_random_table({cudf::type_id::INT8, @@ -73,9 +105,6 @@ static void BM_from_row(benchmark::State& state) cudf::type_id::UINT64}, 256, row_count{n_rows}); - /* auto const table = create_random_table({cudf::type_id::INT32}, - 4, - row_count{n_rows});*/ std::vector schema; cudf::size_type total_bytes = 0; @@ -94,18 +123,19 @@ static void BM_from_row(benchmark::State& state) } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { BM_to_row(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ +}*/ + +#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -TO_ROW_CONVERSION_BENCHMARK_DEFINE(to_row_conversion) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) #define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ BENCHMARK_DEFINE_F(RowConversion, name) \ @@ -116,4 +146,4 @@ TO_ROW_CONVERSION_BENCHMARK_DEFINE(to_row_conversion) ->UseManualTime() \ 
->Unit(benchmark::kMillisecond); -FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) +//FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 994233a0700..92ba075c316 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -44,7 +44,6 @@ static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size return (offset + alignment - 1) & ~(alignment - 1); } - /** * Copy a simple vector to device memory asynchronously. Be sure to read * the data on the same stream as is used to copy it. @@ -61,10 +60,9 @@ std::unique_ptr> copy_to_dev_async(const std::vector & } template -rmm::device_uvector copy_to_dev_async2( - const std::vector &input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) +rmm::device_uvector copy_to_dev_async2(const std::vector &input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { rmm::device_uvector ret(input.size(), stream, mr); CUDA_TRY(cudaMemcpyAsync( @@ -346,7 +344,7 @@ struct block_info { * @param block_infos information about the blocks of work * @param row_offsets offset to a specific row in the input data * @param output_data pointer to output data - * + * */ __global__ void copy_from_columns(const size_type num_rows, const size_type num_columns, @@ -366,92 +364,119 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. - bool debug_print = false; - + bool debug_print = false; // blockIdx.x == 70 && threadIdx.x == 448; + if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); printf("Column Info:\n"); - for (int i=0; i(&output_data[0][output_start_offset]) & 7; // offset for alignment shim in order to match shared memory with final dest + uint8_t const dest_shim_offset = + reinterpret_cast(&output_data[0][output_start_offset]) & + 7; // offset for alignment shim in order to match shared memory with final dest if (debug_print) { printf("outputting to offset %lu\n", output_start_offset); printf("dest shim offset is %d\n", dest_shim_offset); printf("Shared data is %p-%p\n", shared_data, shared_data + (48 * 1024)); + printf("my block is %d,%d -> %d,%d - buffer %d\n", + block.start_col, + block.start_row, + block.end_col, + block.end_row, + block.buffer_num); } // each thread is responsible for every threadcount rows of data. // the data is copies into shared memory in the final layout. 
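// A hedged sketch (plain host C++, not the kernel itself) of how the shared-memory row pitch
// used just below could be derived: take the byte span of this block's columns, add the
// destination alignment shim (the low 3 bits of the output row address), and round up to
// 8 bytes so each shared-memory row starts 8-byte aligned. shmem_row_pitch is a hypothetical
// helper; the parameter names mirror the kernel's col_offsets/col_sizes/dest_shim_offset.
#include <cstdint>

namespace sketch {
constexpr int32_t align_offset(int32_t offset, int32_t alignment)
{
  return (offset + alignment - 1) & ~(alignment - 1);
}

inline int32_t shmem_row_pitch(int32_t const *col_offsets,
                               int32_t const *col_sizes,
                               int start_col,
                               int end_col,
                               void const *dest_row_start)
{
  // bytes this block's columns actually occupy within one row
  int32_t const real_bytes_in_row =
    col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col];
  // shim so byte positions in shared memory line up with the (possibly unaligned) destination
  int32_t const dest_shim_offset =
    static_cast<int32_t>(reinterpret_cast<std::uintptr_t>(dest_row_start) & 7);
  return align_offset(real_bytes_in_row + dest_shim_offset, 8);
}
}  // namespace sketch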
- auto const shmem_row_size = align_offset(col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col] + dest_shim_offset, 8); // 8 byte alignment required for shared memory rows + auto const real_bytes_in_row = + col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col]; + auto const shmem_row_size = align_offset(real_bytes_in_row + dest_shim_offset, + 8); // 8 byte alignment required for shared memory rows auto const validity_offset = col_offsets[num_columns]; if (debug_print) { - printf("col_offsets[%d] = %d, col_sizes[%d] = %d, col_offsets[%d] = %d\n", block.end_col, col_offsets[block.end_col], block.end_col, col_sizes[block.end_col], block.start_col, col_offsets[block.start_col]); + printf("col_offsets[%d] = %d, col_sizes[%d] = %d, col_offsets[%d] = %d\n", + block.end_col, + col_offsets[block.end_col], + block.end_col, + col_sizes[block.end_col], + block.start_col, + col_offsets[block.start_col]); printf("shmem row size %d\n", shmem_row_size); printf("validity offset is %d\n", validity_offset); - printf("starting at %d,%d and going to %d, %d\n", block.start_col, block.start_row, block.end_col, block.end_row); + printf("starting at %d,%d and going to %d, %d\n", + block.start_col, + block.start_row, + block.end_col, + block.end_row); } - for (int col=block.start_col; col<=block.end_col; ++col) { - /*if (!col_is_variable) */{ - uint64_t col_offset = 0; + for (int col = block.start_col; col <= block.end_col; ++col) { + /*if (!col_is_variable) */ { + uint64_t col_offset = 0; cudf::size_type col_size = col_sizes[col]; - auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; - if (debug_print) { - printf("dest col offset %d\n", dest_col_offset); - } - for (int row=block.start_row + threadIdx.x; row(input_data[col]); - if (debug_print) { - printf("%p <- short %d\n", shmem_dest, short_col_input[row]); - } + const int16_t *short_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { printf("%p <- short %d\n", shmem_dest, short_col_input[row]); } *reinterpret_cast(shmem_dest) = short_col_input[row]; break; } case 4: { - const int32_t *int_col_input = reinterpret_cast(input_data[col]); + const int32_t *int_col_input = reinterpret_cast(input_data[col]); if (debug_print) { - printf("shmem[%d][%d] - %p <- int %d\n", row, col, shmem_dest, int_col_input[row]); + printf("shmem[%d][%d] - %p <- int 0x%x\n", row, col, shmem_dest, int_col_input[row]); } *reinterpret_cast(shmem_dest) = int_col_input[row]; break; } case 8: { - const int64_t *long_col_input = reinterpret_cast(input_data[col]); - if (debug_print) { - printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); - } + const int64_t *long_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); } *reinterpret_cast(shmem_dest) = long_col_input[row]; break; } default: { cudf::size_type input_offset = col_size * row; if (debug_print) { - printf("byte for byte copy due to size %d\n", col_size); - printf("%p <- input_data[%d] which is %d\n", shmem_dest, input_offset, input_data[col][input_offset]); - } - // TODO this should just not be supported for fixed width columns, but just in case... + printf("byte for byte copy due to size %d of column %d\n", col_size, col); + printf("%p <- input_data[%d] which is %d\n", + shmem_dest, + input_offset, + input_data[col][input_offset]); + } + // TODO this should just not be supported for fixed width columns, but just in case... 
for (cudf::size_type b = 0; b < col_size; b++) { shmem_dest[b] = input_data[col][b + input_offset]; } @@ -463,11 +488,13 @@ __global__ void copy_from_columns(const size_type num_rows, // so we have to rewrite the addresses to make sure that it is 4 byte aligned // we do this directly in the final location because the entire row may not // fit in shared memory and may require many blocks to process it entirely - int8_t *valid_byte = &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; + int8_t *valid_byte = + &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; cudf::size_type byte_bit_offset = col % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); + if (debug_print) { printf("Outputting validity to %p\n", valid_byte); } // Now copy validity for the column if (input_nm[col]) { if (bit_is_set(input_nm[col], row)) { @@ -479,11 +506,11 @@ __global__ void copy_from_columns(const size_type num_rows, // It is valid so just set the bit atomicOr_block(valid_int, 1 << int_bit_offset); } - } // end row + } // end row - col_offset += col_sizes[col] * (block.end_row - block.start_row); + col_offset += col_sizes[col] * rows_in_block; } - } // end col + } // end col // wait for the data to be totally copied into shared memory __syncthreads(); @@ -496,30 +523,75 @@ __global__ void copy_from_columns(const size_type num_rows, // row in shared memory may not be an entire row of the destination. // auto const thread_start_offset = threadIdx.x * 8; - auto const thread_stride = gridDim.x * 8; - for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * (block.end_row - block.start_row); src_offset += thread_stride) { + auto const thread_stride = gridDim.x * 8; + if (debug_print) { + printf("writing final data from %d to %d at stride %d\n", + thread_start_offset, + shmem_row_size * rows_in_block, + thread_stride); + printf("rows in block %d\n", rows_in_block); + } + for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * rows_in_block; + src_offset += thread_stride) { auto const output_row_num = src_offset / shmem_row_size; - auto const row_offset = row_offsets[block.start_row + output_row_num]; - auto const col_offset = src_offset % shmem_row_size; - int8_t *output_ptr = &output_data[block.buffer_num][row_offset + col_offset]; - int8_t *input_ptr = &shared_data[src_offset]; - // the first part and last part of the row is unaligned data copy. This is copied a single byte - // at a time. - if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { - // first part of a row, copy single bytes + auto const row_offset = row_offsets[block.start_row + output_row_num]; + auto const col_offset = src_offset % shmem_row_size; + int8_t *output_ptr = &output_data[block.buffer_num][row_offset + col_offset]; + int8_t *input_ptr = &shared_data[src_offset]; + + // three cases to worry about here + // 1) first 8-byte part of a large row - dest_shim_offset bytes of pad at the front + // 2) last 8-byte part of a large row - some bytes of pad at the end + // 3) corner case of <= 8 bytes of data, which means dest_shim_offset bytes of pad at the front + // AND potentially pad at the rear + + // we know the real number of bytes in a row, so we can figure out if we are in case 3 easily. + // 1st case is when we're at some even multiple of shmem_row_size offset. 
+ // 2nd case is when offset + 8 is some even multiple of shmem_row_size. + // must be an 8 byte copy + + // there is a chance we have a 0 dest_shim_offset and an 8 byte thing to copy, optimize? + if (real_bytes_in_row + dest_shim_offset <= 8) { + // case 3, we want to copy real_bytes_in_row bytes + auto const num_single_bytes = real_bytes_in_row - dest_shim_offset; + for (auto i = 0; i < num_single_bytes; ++i) { + if (debug_print) { + printf("case 3 - %d single byte final write %p -> %p\n", + num_single_bytes, + &input_ptr[i + dest_shim_offset], + &output_ptr[i]); + } + output_ptr[i] = input_ptr[i + dest_shim_offset]; + } + } else if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { + // first byte with leading pad auto const num_single_bytes = 8 - dest_shim_offset; - for (auto i=0; i %p\n", &input_ptr[i + dest_shim_offset], &output_ptr[i]); + } output_ptr[i] = input_ptr[i + dest_shim_offset]; } - } else if (dest_shim_offset > 0 && (src_offset + 8) % shmem_row_size == 0) { - // last part of a row, copy single bytes - auto const num_single_bytes = dest_shim_offset; - for (auto i=0; i 0) { + // last bytes of a row + auto const num_single_bytes = (real_bytes_in_row + dest_shim_offset) % 8; + for (auto i = 0; i < num_single_bytes; ++i) { + if (debug_print) { + printf("single trailing byte final write %p -> %p\n", + &input_ptr[i + dest_shim_offset], + &output_ptr[i]); + } output_ptr[i] = input_ptr[i + dest_shim_offset]; } } else { // copy 8 bytes aligned - const int64_t *long_col_input = reinterpret_cast(input_ptr); + const int64_t *long_col_input = reinterpret_cast(input_ptr); + if (debug_print) { + printf( + "long final write %p -> %p\n", long_col_input, reinterpret_cast(output_ptr)); + } *reinterpret_cast(output_ptr) = *long_col_input; } } @@ -696,13 +768,14 @@ std::vector> convert_to_rows2(cudf::table_view con rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the data, but small enough - // that multiple columns fit in memory so the writes can coalese as well. Potential optimization for window sizes. + // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the + // data, but small enough that multiple columns fit in memory so the writes can coalese as well. + // Potential optimization for window sizes. constexpr int max_window_height = 1024; - const size_type num_columns = tbl.num_columns(); - const size_type num_rows = tbl.num_rows(); + const size_type num_columns = tbl.num_columns(); + const size_type num_rows = tbl.num_rows(); - #if defined(DEBUG) +#if defined(DEBUG) auto pretty_print = [](uint64_t i) { if (i > (1 * 1024 * 1024 * 1024)) { printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); @@ -714,7 +787,7 @@ std::vector> convert_to_rows2(cudf::table_view con printf("%lu Bytes", i); } }; - #endif +#endif int device_id; CUDA_TRY(cudaGetDevice(&device_id)); @@ -722,11 +795,11 @@ std::vector> convert_to_rows2(cudf::table_view con CUDA_TRY( cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - #if defined(DEBUG) +#if defined(DEBUG) size_t free, total; - cudaMemGetInfo( &free, &total ); - printf("%lu/%lu Memory", free, total); - #endif + cudaMemGetInfo(&free, &total); + printf("%lu/%lu Memory\n", free, total); +#endif // break up the work into blocks, which are a starting and ending row/col #. 
// this window size is calculated based on the shared memory size available @@ -743,45 +816,46 @@ std::vector> convert_to_rows2(cudf::table_view con // to that point. These are row batches and they are decided first before building the // windows so the windows can be properly cut around them. - std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row + std::vector row_sizes; // size of each row in bytes including any alignment padding + std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column - std::vector column_starts; // offset of column inside a row including alignment - std::vector variable_width_columns; // list of the variable width columns in the table + std::vector column_starts; // offset of column inside a row including alignment + std::vector + variable_width_columns; // list of the variable width columns in the table row_sizes.reserve(num_rows); row_offsets.reserve(num_rows); column_sizes.reserve(num_columns); - column_starts.reserve(num_columns+1); // we add a final offset for validity data start + column_starts.reserve(num_columns + 1); // we add a final offset for validity data start size_type fixed_width_size_per_row = 0; for (int col = 0; col < num_columns; ++col) { - auto cv = tbl.column(col); - auto col_type = cv.type(); + auto cv = tbl.column(col); + auto col_type = cv.type(); bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - if (nested_type) { variable_width_columns.push_back(cv);} + if (nested_type) { variable_width_columns.push_back(cv); } // a list or string column will write a single uint64 // of data here for offset/length auto col_size = nested_type ? 8 : size_of(col_type); // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); column_starts.push_back(fixed_width_size_per_row); column_sizes.push_back(col_size); fixed_width_size_per_row += col_size; } - + // When building the columns to return, we have to be mindful of the offset limit in cudf. // It is 32-bit and these data columns are capable of surpassing that easily. The data should // not be cut off exactly at the limit though due to the validity buffers. The most efficient // place to cut the validity is on a 32-row boundary, so as we calculate the row sizes // we keep track of the cut points for the validity, which we call row batches. If the row - // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we hit. - // Note that this boundary is for our book-keeping with column pointers and not anything - // that the kernel needs to worry about. We cut the output at convienient boundaries - // when assembling the outgoing data stream. + // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we + // hit. Note that this boundary is for our book-keeping with column pointers and not anything that + // the kernel needs to worry about. We cut the output at convienient boundaries when assembling + // the outgoing data stream. 
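As a rough illustration of the batching rule described above (names are placeholders, the arithmetic is reduced to the cutting logic only, and it assumes any 32 consecutive rows fit within a 32-bit offset), a host-side sketch could look like this; it is not the code added by this patch:

#include <cstdint>
#include <limits>
#include <vector>

struct RowBatch {     // placeholder for the row_batch type used below
  int64_t num_bytes;
  int32_t row_count;
};

// Group rows into batches whose internal byte offsets fit in an int32,
// cutting each batch back to the last 32-row boundary so validity words
// never straddle a batch.
std::vector<RowBatch> make_row_batches(std::vector<int64_t> const& row_sizes)
{
  std::vector<RowBatch> batches;
  int64_t batch_bytes = 0;  // bytes accumulated in the open batch
  int32_t batch_rows  = 0;  // rows accumulated in the open batch
  int64_t carry_bytes = 0;  // bytes of the rows past the last 32-row boundary
  for (auto const size : row_sizes) {
    if (batch_bytes + size > std::numeric_limits<int32_t>::max()) {
      int32_t const kept_rows = batch_rows & ~31;            // last full boundary
      batches.push_back({batch_bytes - carry_bytes, kept_rows});
      batch_rows  = batch_rows & 31;  // the spilled rows open the next batch
      batch_bytes = carry_bytes;      // ...together with their bytes
    }
    batch_bytes += size;
    ++batch_rows;
    carry_bytes = (batch_rows % 32 == 0) ? 0 : carry_bytes + size;
  }
  if (batch_rows > 0) { batches.push_back({batch_bytes, batch_rows}); }
  return batches;
}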
struct row_batch { size_type num_bytes; size_type row_count; @@ -798,71 +872,90 @@ std::vector> convert_to_rows2(cudf::table_view con // will be included in the variable-width data blob at the end of the // row. return 0; -/* auto c = variable_width_columns[col]; - while (true) { - auto col_offsets = c.child(0).data(); - auto col_data_size = size_of(c.child(1).type()); - std::size_t alignment_needed = col_data_size; - - row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; - if (c.num_children() == 0) { - break; - } - c = c.child(1); - } -*/ + /* auto c = variable_width_columns[col]; + while (true) { + auto col_offsets = c.child(0).data(); + auto col_data_size = size_of(c.child(1).type()); + std::size_t alignment_needed = col_data_size; + + row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; + if (c.num_children() == 0) { + break; + } + c = c.child(1); + } + */ }; uint64_t row_batch_size = 0; uint64_t total_table_size = 0; - size_type row_batch_rows = 0; - uint64_t row_offset = 0; + size_type row_batch_rows = 0; + uint64_t row_offset = 0; + + auto calculate_validity_size = [](int const num_cols) { + // Now we need to add in space for validity + // Eventually we can think about nullable vs not nullable, but for now we will just always add + // it in + return (num_cols + 7) / 8; + }; - // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then calculate - // the size of each row's variable-width data as well. + // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then + // calculate the size of each row's variable-width data and validity as well. for (int row = 0; row < num_rows; ++row) { - row_sizes[row] = fixed_width_size_per_row + calculate_variable_width_row_data_size(row); - if (row_batch_size + row_sizes[row] > std::numeric_limits::max()) { + auto aligned_row_batch_size = + detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned + row_sizes[row] = fixed_width_size_per_row; + // validity is byte aligned + row_sizes[row] += calculate_validity_size(num_columns); + // variable width data is 8-byte aligned + row_sizes[row] = detail::align_offset(row_sizes[row], 8) + + calculate_variable_width_row_data_size(row); // rows are 8 byte aligned + + if (aligned_row_batch_size + row_sizes[row] > std::numeric_limits::max()) { // a new batch starts at the last 32-row boundary - row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); - row_batch_size = 0; - row_batch_rows = row_batch_rows & 31; - row_offset = 0; + row_batches.push_back( + row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batch_size = 0; + row_batch_rows = row_batch_rows & 31; + row_offset = 0; + aligned_row_batch_size = 0; } - row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned + row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned row_offsets.push_back(row_offset); - row_batch_size += row_sizes[row]; + row_batch_size = aligned_row_batch_size + row_sizes[row]; row_offset += row_sizes[row]; - total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned + total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned total_table_size += row_sizes[row]; row_batch_rows++; } if (row_batch_size > 0) { - row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows}); 
} - #if defined(DEBUG) +#if defined(DEBUG) + printf("%d rows and %d columns in table\n", num_rows, num_columns); printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { printf("%d: %d rows, ", i, row_batches[i].row_count); pretty_print(row_batches[i].num_bytes); printf("\n"); } - #endif +#endif std::vector block_infos; // block infos are organized with the windows going "down" the columns // this provides the most coalescing of memory access - int current_window_size = 0; + int current_window_width = 0; int current_window_start_col = 0; // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, num_rows](int const start_col, int const end_col, int const desired_window_height) { + auto build_blocks = [&block_infos, &row_batches, num_rows]( + int const start_col, int const end_col, int const desired_window_height) { int current_window_start_row = 0; int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; while (i < num_rows) { if (rows_left_in_batch == 0) { current_window_row_batch++; @@ -872,9 +965,10 @@ std::vector> convert_to_rows2(cudf::table_view con block_infos.emplace_back( detail::block_info{start_col, - current_window_start_row, - end_col, - std::min(current_window_start_row + window_height - 1, num_rows), current_window_row_batch}); + current_window_start_row, + end_col, + std::min(current_window_start_row + window_height - 1, num_rows - 1), + current_window_row_batch}); i += window_height; current_window_start_row += window_height; @@ -882,7 +976,17 @@ std::vector> convert_to_rows2(cudf::table_view con } }; - int const window_height = std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); + int const window_height = + std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); +#if defined(DEBUG) + printf( + "max_window_height is %d, num_rows is %d, batch row count is %d - which makes window height " + "%d\n", + max_window_height, + num_rows, + row_batches[0].row_count, + window_height); +#endif int row_size = 0; @@ -891,32 +995,74 @@ std::vector> convert_to_rows2(cudf::table_view con auto const col_size = column_sizes[col]; // align size for this type - std::size_t alignment_needed = col_size; // They are the same for fixed width types - auto row_size_with_this_col = detail::align_offset(row_size, alignment_needed) + col_size; + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_aligned = detail::align_offset(row_size, alignment_needed); + auto row_size_with_this_col = row_size_aligned + col_size; if (row_size_with_this_col * window_height > shmem_limit_per_block) { +#if defined(DEBUG) + printf( + "Window size %d too large at column %d, bumping back to build windows of size %d(cols " + "%d-%d), which is %d tall. 
Row size is too large at %d and ok at %d(aligned overall is %d) " + "for shared mem size %d\n", + row_size_with_this_col * window_height, + col, + row_size * window_height, + current_window_start_col, + col - 1, + window_height, + row_size_with_this_col, + row_size, + row_size_aligned, + shmem_limit_per_block); +#endif // too large, close this window, generate vertical blocks and restart build_blocks(current_window_start_col, col - 1, window_height); - row_size = detail::align_offset(column_starts[col] & 7, alignment_needed) + col_size; // alignment required for shared memory window boundary to match alignment of output row + row_size = + detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); +#if defined(DEBUG) + printf( + "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " + "or %d)\n", + row_size, + col_size, + row_size + col_size, + column_starts[col - 1], + column_sizes[col - 1], + column_starts[col - 1] + column_sizes[col - 1]); +#endif + row_size += col_size; // alignment required for shared memory window boundary to match + // alignment of output row current_window_start_col = col; + current_window_width = 0; } else { row_size = row_size_with_this_col; + current_window_width++; } } - auto validity_offset = detail::align_offset(column_starts.back(), 4); +#if defined(DEBUG) + printf("validity offset will be %d + %d = %d\n", + column_starts.back(), + column_sizes.back(), + column_starts.back() + column_sizes.back()); +#endif + auto validity_offset = detail::align_offset(column_starts.back() + column_sizes.back(), 4); column_starts.push_back(validity_offset); - + // build last set of blocks - if (current_window_size > 0) { build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); } + if (current_window_width > 0) { + build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); + } - // Get the pointers to the input columnar data ready - possibly moved up to copy to gpu while calculating other things + // Get the pointers to the input columnar data ready - possibly moved up to copy to gpu while + // calculating other things std::vector input_data; std::vector input_nm; for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); + column_view cv = tbl.column(column_number); auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; if (!nested_type) { input_data.emplace_back(cv.data()); @@ -924,81 +1070,87 @@ std::vector> convert_to_rows2(cudf::table_view con } } - #if defined(DEBUG) - printf("%lu windows for %d columns, %d rows to fit in ", block_infos.size(), block_infos[0].end_col - block_infos[0].start_col, block_infos[0].end_row - block_infos[0].start_row); +#if defined(DEBUG) + printf("%lu windows for %d columns, %d rows to fit in ", + block_infos.size(), + block_infos[0].end_col - block_infos[0].start_col + 1, + block_infos[0].end_row - block_infos[0].start_row); pretty_print(shmem_limit_per_block); printf(" shared mem("); pretty_print(fixed_width_size_per_row); printf("/row, %d columns, %d rows, ", num_columns, num_rows); pretty_print(total_table_size); printf(" total):\n"); - #endif +#endif auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); auto 
dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); - auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); std::vector output_buffers; std::vector output_data; output_data.reserve(row_batches.size()); - for (uint i=0; i(temp.data())); output_buffers.push_back(std::move(temp)); } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); // blast through the entire table and convert it dim3 blocks(block_infos.size()); - dim3 threads(1024); - copy_from_columns<<>>(num_rows, - num_columns, - dev_input_data.data(), - dev_input_nm.data(), - dev_col_sizes.data(), - dev_col_starts.data(), - dev_block_infos.data(), - dev_row_offsets.data(), - reinterpret_cast(dev_output_data.data())); + dim3 threads(std::min((uint64_t)1024, total_table_size / 8)); +#if defined(DEBUG) + printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); + pretty_print(shmem_limit_per_block); + printf(" shared memory\n"); +#endif + copy_from_columns<<>>( + num_rows, + num_columns, + dev_input_data.data(), + dev_input_nm.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + dev_block_infos.data(), + dev_row_offsets.data(), + reinterpret_cast(dev_output_data.data())); // split up the output buffer into multiple buffers based on row batch sizes // and create list of byte columns int offset_offset = 0; std::vector> ret; - for (uint i=0; i offset_vals; offset_vals.reserve(row_batches[i].row_count + 1); size_type cur_offset = 0; offset_vals.push_back(cur_offset); - for (int row=0; row(data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); + auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); + auto offsets = std::make_unique( + data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); - auto data = - std::make_unique(data_type{cudf::type_id::INT8}, - row_batches[i].num_bytes, - std::move(output_buffers[i])); + auto data = std::make_unique( + data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, std::move(output_buffers[i])); ret.push_back(cudf::make_lists_column(row_batches[i].row_count, - std::move(offsets), - std::move(data), - 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, - stream, - mr)); + std::move(offsets), + std::move(data), + 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, + stream, + mr)); } - + return ret; } diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index 818d7a89ddb..c02f83ad1d5 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -21,13 +21,9 @@ #include #include -#include "cudf/lists/lists_column_view.hpp" -#include "cudf/types.hpp" struct ColumnToRowTests : public cudf::test::BaseFixture { }; -struct RowToColumnTests : public cudf::test::BaseFixture { -}; TEST_F(ColumnToRowTests, Single) { @@ -112,105 +108,3 @@ TEST_F(ColumnToRowTests, SingleByteWide) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } } - -TEST_F(RowToColumnTests, Single) -{ - cudf::test::fixed_width_column_wrapper a({-1}); - cudf::table_view in(std::vector{a}); 
- - auto old_rows = cudf::convert_to_rows(in); - std::vector schema{cudf::data_type{cudf::type_id::INT32}}; - for (uint i=0; i a({-1, 0, 1}); - cudf::table_view in(std::vector{a}); - - auto old_rows = cudf::convert_to_rows(in); - std::vector schema{cudf::data_type{cudf::type_id::INT32}}; - for (uint i=0; i int32_t { return rand(); }); - cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); - cudf::table_view in(std::vector{a}); - - auto old_rows = cudf::convert_to_rows(in); - std::vector schema; - schema.reserve(in.num_columns()); - for (auto col = in.begin(); col < in.end(); ++col) { - schema.push_back(col->type()); - } - for (uint i=0; i> cols; - std::vector views; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); - views.push_back(cols.back()); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows(in); - std::vector schema; - schema.reserve(in.num_columns()); - for (auto col = in.begin(); col < in.end(); ++col) { - schema.push_back(col->type()); - } - - for (uint i=0; i> cols; - std::vector views; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); - views.push_back(cols.back()); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows(in); - std::vector schema; - schema.reserve(in.num_columns()); - for (auto col = in.begin(); col < in.end(); ++col) { - schema.push_back(col->type()); - } - for (uint i=0; i Date: Mon, 21 Jun 2021 18:17:45 +0000 Subject: [PATCH 45/80] Updating windows to be generated in a square way so we can have more data to write out as 8-byte writes from shared memory. Shuffled some of the copy to GPU code up so it can start the copy sooner and hopefully won't force stalls. Some bug fixes. 
--- .../row_conversion/row_conversion.cpp | 15 ++- cpp/src/row_conversion/row_conversion.cu | 96 +++++++++++-------- 2 files changed, 67 insertions(+), 44 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index e1228c9df21..d6b195433cf 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -125,7 +125,7 @@ static void BM_new_to_row(benchmark::State& state) state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); }*/ -#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ +#define OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ BENCHMARK_DEFINE_F(RowConversion, name) \ (::benchmark::State & st) { f(st); } \ BENCHMARK_REGISTER_F(RowConversion, name) \ @@ -134,8 +134,17 @@ static void BM_new_to_row(benchmark::State& state) ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) -TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) +#define NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) +NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) #define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ BENCHMARK_DEFINE_F(RowConversion, name) \ diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 92ba075c316..3f221e2f716 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -364,7 +364,7 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. 
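For orientation, each CUDA block's slice of work is described by one block_info entry; a minimal sketch of the lookup the kernel performs is shown below. The field names follow the block_info struct used throughout this patch series, but the kernel body here is reduced to the indexing only and is purely illustrative:

#include <cstdio>

struct block_info {  // mirrors the struct used by these kernels
  int start_col;
  int start_row;
  int end_col;
  int end_row;     // inclusive
  int buffer_num;  // which row-batch output buffer this window belongs to
};

// Minimal sketch: each CUDA block looks up its window and derives its extent.
__global__ void describe_window(block_info const* block_infos)
{
  block_info const block  = block_infos[blockIdx.x];
  int const rows_in_block = block.end_row - block.start_row + 1;
  int const cols_in_block = block.end_col - block.start_col + 1;
  if (threadIdx.x == 0) {
    printf("block %d: %d rows x %d cols -> buffer %d\n",
           (int)blockIdx.x, rows_in_block, cols_in_block, block.buffer_num);
  }
}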
- bool debug_print = false; // blockIdx.x == 70 && threadIdx.x == 448; + constexpr bool debug_print = false; //blockIdx.x == 2649 && threadIdx.x == 479; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -383,6 +383,7 @@ __global__ void copy_from_columns(const size_type num_rows, }*/ printf("output data to %p\n", output_data[block_infos[blockIdx.x].buffer_num]); } + //else { return; } auto block = block_infos[blockIdx.x]; auto const rows_in_block = block.end_row - block.start_row + 1; extern __shared__ int8_t shared_data[]; @@ -416,7 +417,7 @@ __global__ void copy_from_columns(const size_type num_rows, col_sizes[block.end_col], block.start_col, col_offsets[block.start_col]); - printf("shmem row size %d\n", shmem_row_size); + printf("shmem row size %d with real bytes %d\n", shmem_row_size, real_bytes_in_row); printf("validity offset is %d\n", validity_offset); printf("starting at %d,%d and going to %d, %d\n", block.start_col, @@ -524,6 +525,8 @@ __global__ void copy_from_columns(const size_type num_rows, // auto const thread_start_offset = threadIdx.x * 8; auto const thread_stride = gridDim.x * 8; + auto const end_offset = shmem_row_size * rows_in_block; + if (debug_print) { printf("writing final data from %d to %d at stride %d\n", thread_start_offset, @@ -531,7 +534,7 @@ __global__ void copy_from_columns(const size_type num_rows, thread_stride); printf("rows in block %d\n", rows_in_block); } - for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * rows_in_block; + for (auto src_offset = thread_start_offset; src_offset < end_offset; src_offset += thread_stride) { auto const output_row_num = src_offset / shmem_row_size; auto const row_offset = row_offsets[block.start_row + output_row_num]; @@ -771,7 +774,6 @@ std::vector> convert_to_rows2(cudf::table_view con // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the // data, but small enough that multiple columns fit in memory so the writes can coalese as well. // Potential optimization for window sizes. - constexpr int max_window_height = 1024; const size_type num_columns = tbl.num_columns(); const size_type num_rows = tbl.num_rows(); @@ -816,6 +818,25 @@ std::vector> convert_to_rows2(cudf::table_view con // to that point. These are row batches and they are decided first before building the // windows so the windows can be properly cut around them. 
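The hunk below moves the column-pointer gathering and its device copies ahead of the row-size bookkeeping so the host-to-device transfers can start early, as the commit message describes. The general pattern, sketched with plain cudaMemcpyAsync and an rmm::device_uvector rather than the patch's copy_to_dev_async2 helper (whose exact signature is not shown here):

#include <cuda_runtime.h>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <vector>

// Kick off the host-to-device copy of a small metadata vector, then return
// immediately so the host can keep building row sizes and windows while the
// transfer is in flight. True overlap requires a pinned host buffer; with
// pageable memory the runtime may stage the copy synchronously.
template <typename T>
rmm::device_uvector<T> copy_async(std::vector<T> const& host, rmm::cuda_stream_view stream)
{
  rmm::device_uvector<T> device(host.size(), stream);
  cudaMemcpyAsync(device.data(),
                  host.data(),
                  host.size() * sizeof(T),
                  cudaMemcpyHostToDevice,
                  stream.value());
  return device;
}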
+ // Get the pointers to the input columnar data ready + std::vector input_data; + std::vector input_nm; + input_data.reserve(num_columns); + input_nm.reserve(num_columns); + for (size_type column_number = 0; column_number < num_columns; column_number++) { + column_view cv = tbl.column(column_number); + auto const col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (!nested_type) { + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + } + + auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + std::vector row_sizes; // size of each row in bytes including any alignment padding std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column @@ -847,6 +868,9 @@ std::vector> convert_to_rows2(cudf::table_view con fixed_width_size_per_row += col_size; } + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + // When building the columns to return, we have to be mindful of the offset limit in cudf. // It is 32-bit and these data columns are capable of surpassing that easily. The data should // not be cut off exactly at the limit though due to the validity buffers. The most efficient @@ -901,17 +925,18 @@ std::vector> convert_to_rows2(cudf::table_view con // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. + auto validity_size = calculate_validity_size(num_columns); for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned row_sizes[row] = fixed_width_size_per_row; // validity is byte aligned - row_sizes[row] += calculate_validity_size(num_columns); + row_sizes[row] += validity_size; // variable width data is 8-byte aligned row_sizes[row] = detail::align_offset(row_sizes[row], 8) + calculate_variable_width_row_data_size(row); // rows are 8 byte aligned - if (aligned_row_batch_size + row_sizes[row] > std::numeric_limits::max()) { + if ((uint64_t)aligned_row_batch_size + row_sizes[row] > (uint64_t)std::numeric_limits::max()) { // a new batch starts at the last 32-row boundary row_batches.push_back( row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); @@ -932,7 +957,9 @@ std::vector> convert_to_rows2(cudf::table_view con row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows}); } -#if defined(DEBUG) + auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + + #if defined(DEBUG) printf("%d rows and %d columns in table\n", num_rows, num_columns); printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { @@ -942,6 +969,16 @@ std::vector> convert_to_rows2(cudf::table_view con } #endif + std::vector output_buffers; + std::vector output_data; + output_data.reserve(row_batches.size()); + for (uint i = 0; i < row_batches.size(); ++i) { + rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); + output_data.push_back(static_cast(temp.data())); + output_buffers.push_back(std::move(temp)); + } + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + std::vector block_infos; // block infos are organized with the windows going "down" the 
columns @@ -976,8 +1013,13 @@ std::vector> convert_to_rows2(cudf::table_view con } }; + // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write would be memory cache line sized + // access, but since other blocks will read/write the edges this may not turn out to be overly important. + // For now, we will attempt to build a square window as far as byte sizes. x * y = shared_mem_size. + // Which translates to x^2 = shared_mem_size since we want them equal, so height and width are + // sqrt(shared_mem_size). The trick is that it's in bytes, not rows or columns. int const window_height = - std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); + std::min(std::min(size_type(sqrt(shmem_limit_per_block))/column_sizes[0], num_rows), row_batches[0].row_count); #if defined(DEBUG) printf( "max_window_height is %d, num_rows is %d, batch row count is %d - which makes window height " @@ -998,20 +1040,21 @@ std::vector> convert_to_rows2(cudf::table_view con std::size_t alignment_needed = col_size; // They are the same for fixed width types auto row_size_aligned = detail::align_offset(row_size, alignment_needed); auto row_size_with_this_col = row_size_aligned + col_size; + auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - if (row_size_with_this_col * window_height > shmem_limit_per_block) { + if (row_size_with_end_pad * window_height > shmem_limit_per_block) { #if defined(DEBUG) printf( "Window size %d too large at column %d, bumping back to build windows of size %d(cols " "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " "for shared mem size %d\n", - row_size_with_this_col * window_height, + row_size_with_end_pad * window_height, col, row_size * window_height, current_window_start_col, col - 1, window_height, - row_size_with_this_col, + row_size_with_end_pad, row_size, row_size_aligned, shmem_limit_per_block); @@ -1055,20 +1098,6 @@ std::vector> convert_to_rows2(cudf::table_view con build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); } - // Get the pointers to the input columnar data ready - possibly moved up to copy to gpu while - // calculating other things - std::vector input_data; - std::vector input_nm; - for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); - auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (!nested_type) { - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - } #if defined(DEBUG) printf("%lu windows for %d columns, %d rows to fit in ", @@ -1083,26 +1112,11 @@ std::vector> convert_to_rows2(cudf::table_view con printf(" total):\n"); #endif - auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); - auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); - - std::vector output_buffers; - std::vector output_data; - output_data.reserve(row_batches.size()); - for (uint i = 0; i < row_batches.size(); ++i) { - rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); - 
output_data.push_back(static_cast(temp.data())); - output_buffers.push_back(std::move(temp)); - } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); // blast through the entire table and convert it dim3 blocks(block_infos.size()); - dim3 threads(std::min((uint64_t)1024, total_table_size / 8)); + dim3 threads(std::min(1024, shmem_limit_per_block / 8)); #if defined(DEBUG) printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); pretty_print(shmem_limit_per_block); From 5c0e52ce20e0708025917552646f3aa48d312b1a Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 8 Jul 2021 01:52:36 +0000 Subject: [PATCH 46/80] Adding row to column conversion code. Performance falls off a cliff, but starts out reasonably. I haven't looked at this in nsight yet. --- .../row_conversion/row_conversion.cpp | 74 +- cpp/include/cudf/row_conversion.hpp | 12 + cpp/src/row_conversion/row_conversion.cu | 759 +++++++++++++----- cpp/tests/row_conversion/row_conversion.cpp | 106 +++ 4 files changed, 748 insertions(+), 203 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index d6b195433cf..7c1f52c5cd6 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -91,7 +91,7 @@ static void BM_new_to_row(benchmark::State& state) state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); } -/*static void BM_from_row(benchmark::State& state) +static void BM_old_from_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const table = create_random_table({cudf::type_id::INT8, @@ -123,36 +123,62 @@ static void BM_new_to_row(benchmark::State& state) } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -}*/ - -#define OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); +} + +static void BM_new_from_row(benchmark::State& state) +{ + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + auto const table = create_random_table({cudf::type_id::INT8, + cudf::type_id::INT32, + cudf::type_id::INT16, + cudf::type_id::INT64, + cudf::type_id::INT32, + cudf::type_id::BOOL8, + cudf::type_id::UINT16, + cudf::type_id::UINT8, + cudf::type_id::UINT64}, + 256, + row_count{n_rows}); + + std::vector schema; + cudf::size_type total_bytes = 0; + for (int i = 0; i < table->num_columns(); ++i) { + auto t = table->get_column(i).type(); + schema.push_back(t); + total_bytes += cudf::size_of(t); + } + + auto rows = cudf::convert_to_rows(table->view()); + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + + auto out = cudf::convert_from_rows2(rows, schema); + } + + state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); +} -#define NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ +#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, 
name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) -NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) -#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ +#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { BM_from_row(st); } \ + (::benchmark::State & st) { f(st); } \ BENCHMARK_REGISTER_F(RowConversion, name) \ ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 22}}) \ + ->Ranges({{1 << 6, 1 << 20}}) \ ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -//FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) +FROM_ROW_CONVERSION_BENCHMARK_DEFINE(old_from_row_conversion, BM_old_from_row) +FROM_ROW_CONVERSION_BENCHMARK_DEFINE(new_from_row_conversion, BM_new_from_row) diff --git a/cpp/include/cudf/row_conversion.hpp b/cpp/include/cudf/row_conversion.hpp index f5e2225ad19..282ffa4b0cb 100644 --- a/cpp/include/cudf/row_conversion.hpp +++ b/cpp/include/cudf/row_conversion.hpp @@ -48,4 +48,16 @@ std::unique_ptr convert_from_rows( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); +std::unique_ptr convert_from_rows2( + cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr convert_from_rows2( + std::vector> const &input, + std::vector const &schema, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + } // namespace cudf diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 3f221e2f716..c0e78a03576 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -30,6 +30,7 @@ #include #include +#include #include "cudf/types.hpp" #include "rmm/device_buffer.hpp" #include "thrust/iterator/counting_iterator.h" @@ -332,6 +333,20 @@ struct block_info { int buffer_num; }; +// When building the columns to return, we have to be mindful of the offset limit in cudf. +// It is 32-bit and these data columns are capable of surpassing that easily. The data should +// not be cut off exactly at the limit though due to the validity buffers. The most efficient +// place to cut the validity is on a 32-row boundary, so as we calculate the row sizes +// we keep track of the cut points for the validity, which we call row batches. If the row +// is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we +// hit. Note that this boundary is for our book-keeping with column pointers and not anything that +// the kernel needs to worry about. We cut the output at convienient boundaries when assembling +// the outgoing data stream. 
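Because each row batch is capped at what a 32-bit offset can address, the row form comes back as several list columns. A caller-side round trip would then look roughly like the sketch below; it assumes convert_to_rows2 returns one LIST<INT8> column per batch and that convert_from_rows2 rebuilds a table from a batch plus the column schema, which is how the tests and benchmarks in this series use them. round_trip is an illustrative wrapper, not part of the patch:

#include <cudf/column/column.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/row_conversion.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>

#include <memory>
#include <vector>

// Convert a fixed-width table to the row format and back, one batch at a time.
std::vector<std::unique_ptr<cudf::table>> round_trip(cudf::table_view const& input)
{
  std::vector<cudf::data_type> schema;  // one data_type per column, in order
  for (auto const& col : input) { schema.push_back(col.type()); }

  // each returned column is a LIST<INT8> holding one batch of packed rows
  auto row_batches = cudf::convert_to_rows2(input);

  std::vector<std::unique_ptr<cudf::table>> tables;
  for (auto const& batch : row_batches) {
    tables.push_back(cudf::convert_from_rows2(cudf::lists_column_view{batch->view()}, schema));
  }
  return tables;
}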
+struct row_batch { + size_type num_bytes; + size_type row_count; +}; + /** * @brief copy data from cudf columns into x format, which is row-based * @@ -364,7 +379,7 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. - constexpr bool debug_print = false; //blockIdx.x == 2649 && threadIdx.x == 479; + bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -383,7 +398,7 @@ __global__ void copy_from_columns(const size_type num_rows, }*/ printf("output data to %p\n", output_data[block_infos[blockIdx.x].buffer_num]); } - //else { return; } + // else { return; } auto block = block_infos[blockIdx.x]; auto const rows_in_block = block.end_row - block.start_row + 1; extern __shared__ int8_t shared_data[]; @@ -403,7 +418,7 @@ __global__ void copy_from_columns(const size_type num_rows, block.buffer_num); } // each thread is responsible for every threadcount rows of data. - // the data is copies into shared memory in the final layout. + // the data is copied into shared memory in the final layout. auto const real_bytes_in_row = col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col]; auto const shmem_row_size = align_offset(real_bytes_in_row + dest_shim_offset, @@ -432,7 +447,7 @@ __global__ void copy_from_columns(const size_type num_rows, auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; if (debug_print) { printf("dest col offset %d\n", dest_col_offset); } - for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += gridDim.x) { + for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { if (debug_print) { printf("shmem row %d(%d) at offset %d(%d)\n", row - block.start_row, @@ -524,8 +539,8 @@ __global__ void copy_from_columns(const size_type num_rows, // row in shared memory may not be an entire row of the destination. 
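The copy-out below splits each destination range into a few unaligned leading bytes, a run of 8-byte words, and a few unaligned trailing bytes. A simplified, host-side version of that split, without the shared-memory staging or the per-row shim bookkeeping (copy_with_edges is an illustrative name, and memcpy stands in for the kernel's word-sized loads and stores):

#include <cstdint>
#include <cstring>

// Copy `count` bytes to a destination whose first `shim` bytes are unaligned:
// leading bytes one at a time, then 8-byte words, then any unaligned tail.
// The staged rows in the kernel are padded so its word-sized accesses are safe.
inline void copy_with_edges(std::int8_t* dst, std::int8_t const* src, int count, int shim)
{
  int i = 0;
  for (; i < count && i < shim; ++i) { dst[i] = src[i]; }  // leading bytes
  for (; i + 8 <= count; i += 8) {                         // aligned middle
    std::int64_t word;
    std::memcpy(&word, src + i, 8);
    std::memcpy(dst + i, &word, 8);
  }
  for (; i < count; ++i) { dst[i] = src[i]; }              // trailing bytes
}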
// auto const thread_start_offset = threadIdx.x * 8; - auto const thread_stride = gridDim.x * 8; - auto const end_offset = shmem_row_size * rows_in_block; + auto const thread_stride = blockDim.x * 8; + auto const end_offset = shmem_row_size * rows_in_block; if (debug_print) { printf("writing final data from %d to %d at stride %d\n", @@ -559,9 +574,10 @@ __global__ void copy_from_columns(const size_type num_rows, auto const num_single_bytes = real_bytes_in_row - dest_shim_offset; for (auto i = 0; i < num_single_bytes; ++i) { if (debug_print) { - printf("case 3 - %d single byte final write %p -> %p\n", + printf("case 3 - %d single byte final write %p(%d) -> %p\n", num_single_bytes, &input_ptr[i + dest_shim_offset], + input_ptr[i + dest_shim_offset], &output_ptr[i]); } output_ptr[i] = input_ptr[i + dest_shim_offset]; @@ -600,6 +616,237 @@ __global__ void copy_from_columns(const size_type num_rows, } } +/** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param offsets + * @param output_data + * @param output_nm + * @param col_sizes array of sizes for each element in a column - one per column + * @param col_offsets offset into input data row for each column's start + * @param block_infos information about the blocks of work + * @param input_data pointer to input data + * + */ +__global__ void copy_to_columns(const size_type num_rows, + const size_type num_columns, + const size_type *offsets, + int8_t **output_data, + cudf::bitmask_type **output_nm, + const size_type *col_sizes, + const size_type *col_offsets, + const block_info *block_infos, + const int8_t *input_data) +{ + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // This has been broken up for us in the block_info struct, so we don't have + // any calculation to do here, but it is important to note. + + bool debug_print = false; //blockIdx.x == 1 && threadIdx.x == 0; + + if (debug_print) { + printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); + printf("Column Info:\n"); + for (int i = 0; i < num_columns; ++i) { + printf("col %d is at %p with size %d and offset %d\n", + i, + output_data[i], + col_sizes[i], + col_offsets[i]); + } + printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); + /* printf("Row Offsets:\n"); + for (int i=0; i(&input_data[offsets[absolute_row] + offset_in_row]); + if (debug_print) { + printf("which will be address %p\n", long_col_input); + printf("%p <- long %lu\n", shmem_dest, *long_col_input); } + *reinterpret_cast(shmem_dest) = *long_col_input; + } + + __syncthreads(); + + // now we copy from shared memory to final destination. + // the data is laid out in rows in shared memory, so the reads + // for a column will be "vertical". Because of this and the different + // sizes for each column, this portion is handled on row/column basis. + // to prevent each thread working on a single row and also to ensure + // that all threads can do work in the case of more threads than rows, + // we do a global index instead of a double for loop with col/row. 
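That flattened loop is a block-stride walk over the 2-D window; a stripped-down sketch of just the index decoding used below:

#include <cstdio>

// Every thread walks the flattened rows x cols window, decoding
// (relative_row, relative_col) from the flat index so all threads have work
// even when the window has fewer rows than the block has threads.
__global__ void visit_window_elements(int rows_in_block, int cols_in_block)
{
  for (int index = threadIdx.x; index < rows_in_block * cols_in_block;
       index += blockDim.x) {
    int const relative_col = index % cols_in_block;
    int const relative_row = index / cols_in_block;
    if (index < 4) {  // show a few of the mappings
      printf("thread %d -> element (row %d, col %d)\n",
             (int)threadIdx.x, relative_row, relative_col);
    }
  }
}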
+ for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { + auto const relative_col = index % cols_in_block; + auto const relative_row = index / cols_in_block; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + + auto const shared_memory_row_offset = window_quad_width * 8 * relative_row; + auto const shared_memory_offset = col_offsets[absolute_col] - col_offsets[block.start_col] + + shared_memory_row_offset + shared_memory_starting_pad; + auto const column_size = col_sizes[absolute_col]; + + int8_t *shmem_src = &shared_data[shared_memory_offset]; + int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; + + if (debug_print) { + printf("relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, shared_mmeory_row_offset: %d, shared_memory_offset: %d," + " column_size: %d, shmem_src: %p, dst: %p\n", relative_col, relative_row, absolute_col, absolute_row, shared_memory_row_offset, shared_memory_offset, column_size, + shmem_src, dst) ; + } + switch (column_size) { + case 1: { + if (debug_print) { printf("%p <- byte %d\n", dst, *shmem_src); } + *dst = *shmem_src; + break; + } + case 2: { + const int16_t *short_col_input = reinterpret_cast(shmem_src); + if (debug_print) { printf("%p <- short %d\n", dst, *short_col_input); } + *reinterpret_cast(dst) = *short_col_input; + break; + } + case 4: { + const int32_t *int_col_input = reinterpret_cast(shmem_src); + if (debug_print) { printf("%p <- int 0x%x\n", dst, *int_col_input); } + *reinterpret_cast(dst) = *int_col_input; + break; + } + case 8: { + const int64_t *long_col_input = reinterpret_cast(shmem_src); + if (debug_print) { printf("%p <- long %lu\n", dst, *long_col_input); } + *reinterpret_cast(dst) = *long_col_input; + break; + } + default: { + if (debug_print) { + printf("byte for byte copy due to size %d of column %d\n", column_size, absolute_col); + } + // TODO this should just not be supported for fixed width columns, but just in case... + for (cudf::size_type b = 0; b < column_size; b++) { dst[b] = shmem_src[b]; } + break; + } + } + } + + __syncthreads(); + + // now handle validity. Each thread is responsible for 32 rows in a single column. + // to prevent indexing issues with a large number of threads, this is compressed + // to a single loop like above. TODO: investigate using shared memory here + auto const validity_batches_per_col = (num_rows + 31) / 32; + auto const validity_batches_total = validity_batches_per_col * num_columns; + if (debug_print) { + printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n", validity_batches_per_col, validity_batches_total, num_rows); + } + for (int index = threadIdx.x; index < validity_batches_total; index += blockDim.x) { + // what column is this? 
+ auto const col = index / validity_batches_per_col; + auto const batch = index % validity_batches_per_col; + auto const starting_row = batch * 32; + auto const validity_offset = col_offsets[num_columns] + col / 8; + + if (debug_print) { + printf("col: %d, batch: %d, starting_row: %d, validity_offset: %d\n", col, batch, starting_row, validity_offset); + } + + int32_t dst_validity = 0; + for (int row = starting_row; row < std::min(num_rows, starting_row + 32); ++row) { + int8_t const * const validity_ptr = &input_data[offsets[row] + validity_offset]; + + if (debug_print) { + printf("validity_ptr is %p for row %d\nwhich is input_data[%d]\n", validity_ptr, row, offsets[row] + validity_offset); + } + + auto const val_byte = *validity_ptr; + auto const src_shift = col % 8; + auto const dst_shift = row % 32; + auto const src_bit_mask = 1 << src_shift; + if (debug_print) { + printf("src bit mask is 0x%x\n", src_bit_mask); + printf("src shift is 0x%x and dst shift is 0x%x\n", src_shift, dst_shift); + printf("validity bit is 0x%x\n", (val_byte & src_bit_mask) >> src_shift); + } +// auto const dst_bit_mask = 1 << dst_shift; + dst_validity |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); + if (debug_print) { + printf("validity is now 0x%x\n", dst_validity); + } + } + + + int32_t *validity_ptr = reinterpret_cast(output_nm[col] + (starting_row / 32)); + if (debug_print) { + printf("valiidty_ptr is output_nm[%d]: %p + starting_row / 8: %d because starting row is %d, which becomes %p\n", col, output_nm[col], starting_row / 32, starting_row, output_nm[col] + (starting_row / 32)); + printf("validity to write is %d\n", dst_validity); + printf("validity write %p <- %d\n", validity_ptr, dst_validity); + } + *validity_ptr = dst_validity; + } +} + /** * Calculate the dimensions of the kernel for fixed width only columns. * @param [in] num_columns the number of columns being copied. @@ -764,21 +1011,165 @@ static inline int32_t compute_fixed_width_layout(std::vector co return align_offset(at_offset, 8); // 8 bytes (64 bits) } -} // namespace detail +template +static size_type compute_column_information( + iterator begin, + iterator end, + std::vector &column_starts, + std::vector &column_sizes)//, + //std::function nested_type_cb) +{ + size_type fixed_width_size_per_row = 0; + for (auto cv = begin; cv != end; ++cv) { + auto col_type = std::get<0>(*cv); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + +// if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 
8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + } + + auto validity_offset = detail::align_offset(fixed_width_size_per_row, 4); + column_starts.push_back(validity_offset); + + return fixed_width_size_per_row; +} //#define DEBUG -std::vector> convert_to_rows2(cudf::table_view const &tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + +static std::vector build_block_infos(std::vector const &column_sizes, + std::vector const &column_starts, + std::vector const &row_batches, + size_type const total_number_of_rows, + size_type const &shmem_limit_per_block) { - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the - // data, but small enough that multiple columns fit in memory so the writes can coalese as well. - // Potential optimization for window sizes. - const size_type num_columns = tbl.num_columns(); - const size_type num_rows = tbl.num_rows(); + std::vector block_infos; + + // block infos are organized with the windows going "down" the columns + // this provides the most coalescing of memory access + int current_window_width = 0; + int current_window_start_col = 0; + + // build the blocks for a specific set of columns + auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( + int const start_col, int const end_col, int const desired_window_height) { + int current_window_start_row = 0; + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; + while (i < total_number_of_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(desired_window_height, rows_left_in_batch); + + block_infos.emplace_back(detail::block_info{ + start_col, + current_window_start_row, + end_col, + std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), + current_window_row_batch}); + + i += window_height; + current_window_start_row += window_height; + rows_left_in_batch -= window_height; + } + }; + + // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write + // would be memory cache line sized access, but since other blocks will read/write the edges this + // may not turn out to be overly important. For now, we will attempt to build a square window as + // far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we + // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in + // bytes, not rows or columns. 
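A worked example of that square-window arithmetic (the 48 KiB figure and the 8-byte element size are illustrative values, not queried ones):

#include <cmath>
#include <cstdio>

// With 48 KiB of shared memory per block, each side of the "square" staging
// area is about sqrt(49152) = 221 bytes. For an 8-byte leading column that is
// a window roughly 221 / 8 = 27 rows tall, before clamping against the table's
// row count and the first row batch.
int main()
{
  int const shmem_limit_per_block = 48 * 1024;  // illustrative, not queried
  int const first_column_size     = 8;          // e.g. an INT64 column
  int const side_bytes =
    static_cast<int>(std::sqrt(static_cast<double>(shmem_limit_per_block)));
  int const height = side_bytes / first_column_size;
  std::printf("window ~%d bytes wide, ~%d rows tall\n", side_bytes, height);
  return 0;
}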
+ int const window_height = std::min( + std::min(size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], total_number_of_rows), + row_batches[0].row_count); +#if defined(DEBUG) + printf( + "sqrt(shmem_limit_per_block) / column_sizes[0] is %d and num_rows is %d, batch row count is %d - which makes window height " + "%d\n", + size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], + total_number_of_rows, + row_batches[0].row_count, + window_height); +#endif + + int row_size = 0; + + // march each column and build the blocks of appropriate sizes + for (unsigned int col = 0; col < column_sizes.size(); ++col) { + auto const col_size = column_sizes[col]; + + // align size for this type + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_aligned = detail::align_offset(row_size, alignment_needed); + auto row_size_with_this_col = row_size_aligned + col_size; + auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); + + if (row_size_with_end_pad * window_height > shmem_limit_per_block) { +#if defined(DEBUG) + printf( + "Window size %d too large at column %d, bumping back to build windows of size %d(cols " + "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " + "for shared mem size %d\n", + row_size_with_end_pad * window_height, + col, + row_size * window_height, + current_window_start_col, + col - 1, + window_height, + row_size_with_end_pad, + row_size, + row_size_aligned, + shmem_limit_per_block); +#endif + // too large, close this window, generate vertical blocks and restart + build_blocks(current_window_start_col, col - 1, window_height); + row_size = + detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); +#if defined(DEBUG) + printf( + "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " + "or %d)\n", + row_size, + col_size, + row_size + col_size, + column_starts[col - 1], + column_sizes[col - 1], + column_starts[col - 1] + column_sizes[col - 1]); +#endif + row_size += col_size; // alignment required for shared memory window boundary to match + // alignment of output row + current_window_start_col = col; + current_window_width = 0; + } else { + row_size = row_size_with_this_col; + current_window_width++; + } + } + + // build last set of blocks + if (current_window_width > 0) { + build_blocks(current_window_start_col, (int)column_sizes.size()-1, window_height); + } + + return block_infos; +} +} // namespace detail #if defined(DEBUG) - auto pretty_print = [](uint64_t i) { + void pretty_print(uint64_t i) { if (i > (1 * 1024 * 1024 * 1024)) { printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); } else if (i > (1 * 1024 * 1024)) { @@ -788,9 +1179,19 @@ std::vector> convert_to_rows2(cudf::table_view con } else { printf("%lu Bytes", i); } - }; + } #endif +std::vector> convert_to_rows2(cudf::table_view const &tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the + // data, but small enough that multiple columns fit in memory so the writes can coalese as well. + // Potential optimization for window sizes. 
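Most of the sizes computed below are rounded up with detail::align_offset; assuming it is the usual power-of-two round-up (its definition is not shown in this hunk), it behaves like this small helper, included only to make the padding arithmetic easier to follow:

#include <cstddef>

// Round `offset` up to the next multiple of `alignment` (a power of two).
constexpr std::size_t align_offset(std::size_t offset, std::size_t alignment)
{
  return (offset + alignment - 1) & ~(alignment - 1);
}

static_assert(align_offset(6, 4) == 8, "6 pads up to the next 4-byte boundary");
static_assert(align_offset(8, 8) == 8, "already-aligned offsets are unchanged");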
+ const size_type num_columns = tbl.num_columns(); + const size_type num_rows = tbl.num_rows(); + int device_id; CUDA_TRY(cudaGetDevice(&device_id)); int shmem_limit_per_block; @@ -834,8 +1235,8 @@ std::vector> convert_to_rows2(cudf::table_view con } } - auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); std::vector row_sizes; // size of each row in bytes including any alignment padding std::vector row_offsets; // offset from the start of the data to this row @@ -848,43 +1249,48 @@ std::vector> convert_to_rows2(cudf::table_view con column_sizes.reserve(num_columns); column_starts.reserve(num_columns + 1); // we add a final offset for validity data start - size_type fixed_width_size_per_row = 0; - for (int col = 0; col < num_columns; ++col) { - auto cv = tbl.column(col); - auto col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&tbl](auto i) -> std::tuple { + return std::make_tuple(tbl.column(i).type(), tbl.column(i)); + }); + + size_type fixed_width_size_per_row = detail::compute_column_information( + iter, + iter + num_columns, + column_starts, + column_sizes);//, +// [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); + /* size_type fixed_width_size_per_row = 0; + for (int col = 0; col < num_columns; ++col) { + auto cv = tbl.column(col); + auto col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (nested_type) { variable_width_columns.push_back(cv); } + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + }*/ - if (nested_type) { variable_width_columns.push_back(cv); } +#if defined(DEBUG) + printf("validity offset will be %d + %d = %d\n", + column_starts.back(), + column_sizes.back(), + column_starts.back() + column_sizes.back()); +#endif - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 8 : size_of(col_type); - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - } + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); - - // When building the columns to return, we have to be mindful of the offset limit in cudf. 
- // It is 32-bit and these data columns are capable of surpassing that easily. The data should - // not be cut off exactly at the limit though due to the validity buffers. The most efficient - // place to cut the validity is on a 32-row boundary, so as we calculate the row sizes - // we keep track of the cut points for the validity, which we call row batches. If the row - // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we - // hit. Note that this boundary is for our book-keeping with column pointers and not anything that - // the kernel needs to worry about. We cut the output at convienient boundaries when assembling - // the outgoing data stream. - struct row_batch { - size_type num_bytes; - size_type row_count; - }; - std::vector row_batches; + std::vector row_batches; auto calculate_variable_width_row_data_size = [](int const row) { // each level of variable-width data will add an offset/length @@ -936,10 +1342,11 @@ std::vector> convert_to_rows2(cudf::table_view con row_sizes[row] = detail::align_offset(row_sizes[row], 8) + calculate_variable_width_row_data_size(row); // rows are 8 byte aligned - if ((uint64_t)aligned_row_batch_size + row_sizes[row] > (uint64_t)std::numeric_limits::max()) { + if ((uint64_t)aligned_row_batch_size + row_sizes[row] > + (uint64_t)std::numeric_limits::max()) { // a new batch starts at the last 32-row boundary row_batches.push_back( - row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); row_batch_size = 0; row_batch_rows = row_batch_rows & 31; row_offset = 0; @@ -954,12 +1361,12 @@ std::vector> convert_to_rows2(cudf::table_view con row_batch_rows++; } if (row_batch_size > 0) { - row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows}); + row_batches.push_back(detail::row_batch{static_cast(row_batch_size), row_batch_rows}); } auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); - #if defined(DEBUG) +#if defined(DEBUG) printf("%d rows and %d columns in table\n", num_rows, num_columns); printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { @@ -979,125 +1386,8 @@ std::vector> convert_to_rows2(cudf::table_view con } auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); - std::vector block_infos; - - // block infos are organized with the windows going "down" the columns - // this provides the most coalescing of memory access - int current_window_width = 0; - int current_window_start_col = 0; - - // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, num_rows]( - int const start_col, int const end_col, int const desired_window_height) { - int current_window_start_row = 0; - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; - while (i < num_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(desired_window_height, rows_left_in_batch); - - block_infos.emplace_back( - detail::block_info{start_col, - current_window_start_row, - end_col, - std::min(current_window_start_row + window_height - 1, num_rows - 1), - current_window_row_batch}); - - i += window_height; - current_window_start_row += window_height; - rows_left_in_batch -= window_height; - } - }; - - // the ideal window height has 
lots of 8-byte reads and 8-byte writes. The optimal read/write would be memory cache line sized - // access, but since other blocks will read/write the edges this may not turn out to be overly important. - // For now, we will attempt to build a square window as far as byte sizes. x * y = shared_mem_size. - // Which translates to x^2 = shared_mem_size since we want them equal, so height and width are - // sqrt(shared_mem_size). The trick is that it's in bytes, not rows or columns. - int const window_height = - std::min(std::min(size_type(sqrt(shmem_limit_per_block))/column_sizes[0], num_rows), row_batches[0].row_count); -#if defined(DEBUG) - printf( - "max_window_height is %d, num_rows is %d, batch row count is %d - which makes window height " - "%d\n", - max_window_height, - num_rows, - row_batches[0].row_count, - window_height); -#endif - - int row_size = 0; - - // march each column and build the blocks of appropriate sizes - for (int col = 0; col < num_columns; ++col) { - auto const col_size = column_sizes[col]; - - // align size for this type - std::size_t alignment_needed = col_size; // They are the same for fixed width types - auto row_size_aligned = detail::align_offset(row_size, alignment_needed); - auto row_size_with_this_col = row_size_aligned + col_size; - auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - - if (row_size_with_end_pad * window_height > shmem_limit_per_block) { -#if defined(DEBUG) - printf( - "Window size %d too large at column %d, bumping back to build windows of size %d(cols " - "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " - "for shared mem size %d\n", - row_size_with_end_pad * window_height, - col, - row_size * window_height, - current_window_start_col, - col - 1, - window_height, - row_size_with_end_pad, - row_size, - row_size_aligned, - shmem_limit_per_block); -#endif - // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col - 1, window_height); - row_size = - detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); -#if defined(DEBUG) - printf( - "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " - "or %d)\n", - row_size, - col_size, - row_size + col_size, - column_starts[col - 1], - column_sizes[col - 1], - column_starts[col - 1] + column_sizes[col - 1]); -#endif - row_size += col_size; // alignment required for shared memory window boundary to match - // alignment of output row - current_window_start_col = col; - current_window_width = 0; - } else { - row_size = row_size_with_this_col; - current_window_width++; - } - } - -#if defined(DEBUG) - printf("validity offset will be %d + %d = %d\n", - column_starts.back(), - column_sizes.back(), - column_starts.back() + column_sizes.back()); -#endif - auto validity_offset = detail::align_offset(column_starts.back() + column_sizes.back(), 4); - column_starts.push_back(validity_offset); - - // build last set of blocks - if (current_window_width > 0) { - build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); - } - + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); #if defined(DEBUG) printf("%lu windows for %d columns, %d rows to fit in ", @@ -1116,7 +1406,11 @@ std::vector> convert_to_rows2(cudf::table_view con // blast through the entire table and convert it dim3 blocks(block_infos.size()); - dim3 
threads(std::min(1024, shmem_limit_per_block / 8)); + #if defined(DEBUG) || 1 + dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)total_table_size)); + #else + dim3 threads(std::min(std::min(1024, shmem_limit_per_block / 8), (int)total_table_size)); + #endif #if defined(DEBUG) printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); pretty_print(shmem_limit_per_block); @@ -1206,11 +1500,11 @@ std::vector> convert_to_rows(cudf::table_view cons using ScalarType = cudf::scalar_type_t; auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - zero->set_valid(true, stream); + zero->set_valid_async(true, stream); static_cast(zero.get())->set_value(0, stream); auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - step->set_valid(true, stream); + step->set_valid_async(true, stream); static_cast(step.get()) ->set_value(static_cast(size_per_row), stream); @@ -1238,6 +1532,97 @@ std::vector> convert_to_rows(cudf::table_view cons } } +std::unique_ptr convert_from_rows2(cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + // verify that the types are what we expect + cudf::column_view child = input.child(); + cudf::type_id list_type = child.type().id(); + CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, + "Only a list of bytes is supported as input"); + + cudf::size_type num_columns = schema.size(); + cudf::size_type num_rows = input.parent().size(); + + int device_id; + CUDA_TRY(cudaGetDevice(&device_id)); + int shmem_limit_per_block; + CUDA_TRY( + cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + + std::vector column_starts; + std::vector column_sizes; + + auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { + return std::make_tuple(schema[i], nullptr); + }); + size_type fixed_width_size_per_row = detail::compute_column_information( + iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); + + size_type validity_size = (num_columns + 7) / 8; + + size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); + + // Ideally we would check that the offsets are all the same, etc. 
but for now + // this is probably fine + CUDF_EXPECTS(row_size * num_rows == child.size(), + "The layout of the data appears to be off"); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + + // build the row_batches from the passed in list column + std::vector row_batches; + + row_batches.push_back(detail::row_batch{child.size(), num_rows}); + + // Allocate the columns we are going to write into + std::vector> output_columns; + std::vector output_data; + std::vector output_nm; + for (cudf::size_type i = 0; i < num_columns; i++) { + auto column = cudf::make_fixed_width_column( + schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); + auto mut = column->mutable_view(); + output_data.emplace_back(mut.data()); + output_nm.emplace_back(mut.null_mask()); + output_columns.emplace_back(std::move(column)); + } + + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + auto dev_output_nm = detail::copy_to_dev_async2(output_nm, stream, mr); + + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); + + auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + + dim3 blocks(block_infos.size()); + #if defined(DEBUG) || 1 + dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)child.size())); + #else + dim3 threads(std::min(std::min(1024, shmem_limit_per_block / 8), (int)child.size())); + #endif +#if defined(DEBUG) + printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); + pretty_print(shmem_limit_per_block); + printf(" shared memory\n"); +#endif + detail::copy_to_columns<<>>( + num_rows, + num_columns, + input.offsets().data(), + dev_output_data.data(), + dev_output_nm.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + dev_block_infos.data(), + child.data()); + + return std::make_unique(std::move(output_columns)); +} + std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, std::vector const &schema, rmm::cuda_stream_view stream, @@ -1318,4 +1703,20 @@ std::unique_ptr convert_from_rows( // } } +std::unique_ptr convert_from_rows2( + std::vector> const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + CUDF_EXPECTS(input.size() == 1, "Too large of an input, need to concat the output tables..."); + + // for (uint i=0; iview(); + auto ret = convert_from_rows2(lcv, schema, stream, mr); + + return ret; + // } +} + } // namespace cudf diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index c02f83ad1d5..818d7a89ddb 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -21,9 +21,13 @@ #include #include +#include "cudf/lists/lists_column_view.hpp" +#include "cudf/types.hpp" struct ColumnToRowTests : public cudf::test::BaseFixture { }; +struct RowToColumnTests : public cudf::test::BaseFixture { +}; TEST_F(ColumnToRowTests, Single) { @@ -108,3 +112,105 @@ TEST_F(ColumnToRowTests, SingleByteWide) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } } + +TEST_F(RowToColumnTests, Single) +{ + cudf::test::fixed_width_column_wrapper a({-1}); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema{cudf::data_type{cudf::type_id::INT32}}; + for (uint i=0; i a({-1, 0, 1}); + cudf::table_view 
in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema{cudf::data_type{cudf::type_id::INT32}}; + for (uint i=0; i int32_t { return rand(); }); + cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema; + schema.reserve(in.num_columns()); + for (auto col = in.begin(); col < in.end(); ++col) { + schema.push_back(col->type()); + } + for (uint i=0; i> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema; + schema.reserve(in.num_columns()); + for (auto col = in.begin(); col < in.end(); ++col) { + schema.push_back(col->type()); + } + + for (uint i=0; i> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema; + schema.reserve(in.num_columns()); + for (auto col = in.begin(); col < in.end(); ++col) { + schema.push_back(col->type()); + } + for (uint i=0; i Date: Thu, 8 Jul 2021 20:45:18 +0000 Subject: [PATCH 47/80] updating to use make_device_uvector_async and bitmask functions per review comments --- cpp/src/row_conversion/row_conversion.cu | 125 +++++++++-------------- 1 file changed, 47 insertions(+), 78 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index c0e78a03576..c73e967cf0f 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -36,6 +37,7 @@ #include "thrust/iterator/counting_iterator.h" #include "thrust/iterator/transform_iterator.h" +using cudf::detail::make_device_uvector_async; namespace cudf { namespace detail { @@ -45,32 +47,6 @@ static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size return (offset + alignment - 1) & ~(alignment - 1); } -/** - * Copy a simple vector to device memory asynchronously. Be sure to read - * the data on the same stream as is used to copy it. 
- */ -template -std::unique_ptr> copy_to_dev_async(const std::vector &input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - std::unique_ptr> ret(new rmm::device_uvector(input.size(), stream, mr)); - CUDA_TRY(cudaMemcpyAsync( - ret->data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); - return ret; -} - -template -rmm::device_uvector copy_to_dev_async2(const std::vector &input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - rmm::device_uvector ret(input.size(), stream, mr); - CUDA_TRY(cudaMemcpyAsync( - ret.data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); - return ret; -} - __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type row_size, @@ -180,8 +156,8 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, } cudf::bitmask_type *nm = output_nm[col_index]; - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; + int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; + cudf::size_type byte_bit_offset = intra_word_index(col_index); int predicate = *valid_byte & (1 << byte_bit_offset); uint32_t bitmask = __ballot_sync(active_mask, predicate); if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } @@ -278,8 +254,8 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, } // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; + int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; + cudf::size_type byte_bit_offset = intra_word_index(col_index); uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -505,8 +481,8 @@ __global__ void copy_from_columns(const size_type num_rows, // we do this directly in the final location because the entire row may not // fit in shared memory and may require many blocks to process it entirely int8_t *valid_byte = - &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; - cudf::size_type byte_bit_offset = col % 8; + &output_data[block.buffer_num][row_offsets[row] + validity_offset + word_index(col)]; + cudf::size_type byte_bit_offset = intra_word_index(col); uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -648,7 +624,7 @@ __global__ void copy_to_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. 
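
For orientation, a minimal sketch of one way a thread block can walk a block_info rectangle of the table. The struct and function names are illustrative, not types from this patch, and the exact index mapping differs between the two kernels; the idea is that a flat element index is split so consecutive threads handle consecutive rows of the same column, keeping the per-column global accesses coalesced.

// Hypothetical stand-in for the patch's block_info: an inclusive rectangle of the table.
struct tile_bounds {
  int start_col, start_row, end_col, end_row;
};

// Walk every element of the rectangle with the whole thread block.
__device__ void for_each_tile_element(tile_bounds const tile)
{
  int const rows_in_tile = tile.end_row - tile.start_row + 1;
  int const cols_in_tile = tile.end_col - tile.start_col + 1;
  for (int el = threadIdx.x; el < rows_in_tile * cols_in_tile; el += blockDim.x) {
    int const relative_col = el / rows_in_tile;
    int const relative_row = el % rows_in_tile;
    int const absolute_col = tile.start_col + relative_col;
    int const absolute_row = tile.start_row + relative_row;
    // ...copy one element between shared memory and (absolute_row, absolute_col) here...
    (void)absolute_col;
    (void)absolute_row;
  }
}
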
- bool debug_print = false; //blockIdx.x == 1 && threadIdx.x == 0; + bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -806,7 +782,7 @@ __global__ void copy_to_columns(const size_type num_rows, auto const col = index / validity_batches_per_col; auto const batch = index % validity_batches_per_col; auto const starting_row = batch * 32; - auto const validity_offset = col_offsets[num_columns] + col / 8; + auto const validity_offset = col_offsets[num_columns] + word_index(col); if (debug_print) { printf("col: %d, batch: %d, starting_row: %d, validity_offset: %d\n", col, batch, starting_row, validity_offset); @@ -821,7 +797,7 @@ __global__ void copy_to_columns(const size_type num_rows, } auto const val_byte = *validity_ptr; - auto const src_shift = col % 8; + auto const src_shift = intra_word_index(col); auto const dst_shift = row % 32; auto const src_bit_mask = 1 << src_shift; if (debug_print) { @@ -920,10 +896,10 @@ static std::unique_ptr fixed_width_convert_to_rows( const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type size_per_row, - std::unique_ptr> &column_start, - std::unique_ptr> &column_size, - std::unique_ptr> &input_data, - std::unique_ptr> &input_nm, + rmm::device_uvector &column_start, + rmm::device_uvector &column_size, + rmm::device_uvector &input_data, + rmm::device_uvector &input_nm, const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, rmm::cuda_stream_view stream, @@ -954,10 +930,10 @@ static std::unique_ptr fixed_width_convert_to_rows( num_rows, num_columns, size_per_row, - column_start->data(), - column_size->data(), - input_data->data(), - input_nm->data(), + column_start.data(), + column_size.data(), + input_data.data(), + input_nm.data(), data->mutable_view().data()); return cudf::make_lists_column(num_rows, @@ -1004,7 +980,7 @@ static inline int32_t compute_fixed_width_layout(std::vector co // Now we need to add in space for validity // Eventually we can think about nullable vs not nullable, but for now we will just always add it // in - int32_t validity_bytes_needed = (schema.size() + 7) / 8; + int32_t validity_bytes_needed = word_index(schema.size() + 7); // validity comes at the end and is byte aligned so we can pack more in. 
at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned @@ -1235,8 +1211,8 @@ std::vector> convert_to_rows2(cudf::table_view con } } - auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + auto dev_input_data = make_device_uvector_async(input_data, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); std::vector row_sizes; // size of each row in bytes including any alignment padding std::vector row_offsets; // offset from the start of the data to this row @@ -1287,8 +1263,8 @@ std::vector> convert_to_rows2(cudf::table_view con #endif - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); std::vector row_batches; @@ -1322,16 +1298,9 @@ std::vector> convert_to_rows2(cudf::table_view con size_type row_batch_rows = 0; uint64_t row_offset = 0; - auto calculate_validity_size = [](int const num_cols) { - // Now we need to add in space for validity - // Eventually we can think about nullable vs not nullable, but for now we will just always add - // it in - return (num_cols + 7) / 8; - }; - // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. - auto validity_size = calculate_validity_size(num_columns); + auto validity_size = num_bitmask_words(num_columns); for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned @@ -1364,7 +1333,7 @@ std::vector> convert_to_rows2(cudf::table_view con row_batches.push_back(detail::row_batch{static_cast(row_batch_size), row_batch_rows}); } - auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); #if defined(DEBUG) printf("%d rows and %d columns in table\n", num_rows, num_columns); @@ -1384,7 +1353,7 @@ std::vector> convert_to_rows2(cudf::table_view con output_data.push_back(static_cast(temp.data())); output_buffers.push_back(std::move(temp)); } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); std::vector block_infos = build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); @@ -1402,7 +1371,7 @@ std::vector> convert_to_rows2(cudf::table_view con printf(" total):\n"); #endif - auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); // blast through the entire table and convert it dim3 blocks(block_infos.size()); @@ -1443,7 +1412,7 @@ std::vector> convert_to_rows2(cudf::table_view con } offset_offset += row_batches[i].row_count; - auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); + auto dev_offsets = make_device_uvector_async(offset_vals, stream, mr); auto offsets = std::make_unique( data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); @@ -1477,8 +1446,8 @@ std::vector> convert_to_rows(cudf::table_view cons std::vector column_size; int32_t size_per_row = 
detail::compute_fixed_width_layout(schema, column_start, column_size); - auto dev_column_start = detail::copy_to_dev_async(column_start, stream, mr); - auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); + auto dev_column_start = make_device_uvector_async(column_start, stream, mr); + auto dev_column_size = make_device_uvector_async(column_size, stream, mr); int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; // Make the number of rows per batch a multiple of 32 so we don't have to worry about @@ -1495,8 +1464,8 @@ std::vector> convert_to_rows(cudf::table_view cons input_data.emplace_back(cv.data()); input_nm.emplace_back(cv.null_mask()); } - auto dev_input_data = detail::copy_to_dev_async(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async(input_nm, stream, mr); + auto dev_input_data = make_device_uvector_async(input_data, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); using ScalarType = cudf::scalar_type_t; auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); @@ -1561,7 +1530,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i size_type fixed_width_size_per_row = detail::compute_column_information( iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); - size_type validity_size = (num_columns + 7) / 8; + size_type validity_size = num_bitmask_words(num_columns); size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); @@ -1569,8 +1538,8 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i // this is probably fine CUDF_EXPECTS(row_size * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); // build the row_batches from the passed in list column std::vector row_batches; @@ -1590,13 +1559,13 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i output_columns.emplace_back(std::move(column)); } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); - auto dev_output_nm = detail::copy_to_dev_async2(output_nm, stream, mr); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); std::vector block_infos = build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); dim3 blocks(block_infos.size()); #if defined(DEBUG) || 1 @@ -1647,8 +1616,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in // this is probably fine CUDF_EXPECTS(size_per_row * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_column_start = detail::copy_to_dev_async(column_start, stream, mr); - auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); + auto dev_column_start = make_device_uvector_async(column_start, stream); + auto dev_column_size = make_device_uvector_async(column_size, stream); // Allocate the columns we are going to write into std::vector> output_columns; @@ -1663,8 +1632,8 @@ 
std::unique_ptr convert_from_rows(cudf::lists_column_view const &in output_columns.emplace_back(std::move(column)); } - auto dev_output_data = detail::copy_to_dev_async(output_data, stream, mr); - auto dev_output_nm = detail::copy_to_dev_async(output_nm, stream, mr); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); dim3 blocks; dim3 threads; @@ -1675,10 +1644,10 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in num_rows, num_columns, size_per_row, - dev_column_start->data(), - dev_column_size->data(), - dev_output_data->data(), - dev_output_nm->data(), + dev_column_start.data(), + dev_column_size.data(), + dev_output_data.data(), + dev_output_nm.data(), child.data()); return std::make_unique(std::move(output_columns)); From 7bb049655fa35a4453cd705b8740e30eb2041533 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 13 Jul 2021 07:18:49 +0000 Subject: [PATCH 48/80] updating conversion code. Found out bit operations are on 32-bit values, so they can't be used since row data has byte-aligned validity. Performance improvements on the row to column side. --- cpp/src/row_conversion/row_conversion.cu | 106 ++++++++++++----------- 1 file changed, 54 insertions(+), 52 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index c73e967cf0f..0879a1c50a5 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -37,6 +37,8 @@ #include "thrust/iterator/counting_iterator.h" #include "thrust/iterator/transform_iterator.h" +#define NUM_BLOCKS_PER_KERNEL_TO_COLUMNS (2) + using cudf::detail::make_device_uvector_async; namespace cudf { @@ -156,11 +158,11 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, } cudf::bitmask_type *nm = output_nm[col_index]; - int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; - cudf::size_type byte_bit_offset = intra_word_index(col_index); + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; int predicate = *valid_byte & (1 << byte_bit_offset); uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } + if (row_index % 32 == 0) { nm[row_index / 8] = bitmask; } } // end column loop } // end row copy // wait for the row_group to be totally copied before starting on the next row group @@ -254,8 +256,8 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, } // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; - cudf::size_type byte_bit_offset = intra_word_index(col_index); + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -481,8 +483,8 @@ __global__ void copy_from_columns(const size_type num_rows, // we do this directly in the final location because the entire row may not // fit in shared memory and may require many blocks to process it entirely int8_t *valid_byte = - &output_data[block.buffer_num][row_offsets[row] + validity_offset + word_index(col)]; - cudf::size_type 
byte_bit_offset = intra_word_index(col); + &output_data[block.buffer_num][row_offsets[row] + validity_offset + (col / 8)]; + cudf::size_type byte_bit_offset = col % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -597,6 +599,7 @@ __global__ void copy_from_columns(const size_type num_rows, * * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block * @param offsets * @param output_data * @param output_nm @@ -608,6 +611,7 @@ __global__ void copy_from_columns(const size_type num_rows, */ __global__ void copy_to_columns(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, const size_type *offsets, int8_t **output_data, cudf::bitmask_type **output_nm, @@ -624,18 +628,10 @@ __global__ void copy_to_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. - bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; + constexpr bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("Column Info:\n"); - for (int i = 0; i < num_columns; ++i) { - printf("col %d is at %p with size %d and offset %d\n", - i, - output_data[i], - col_sizes[i], - col_offsets[i]); - } printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); /* printf("Row Offsets:\n"); for (int i=0; i blockDim.x) { + break; + } + auto block = block_infos[this_block_index]; auto const rows_in_block = block.end_row - block.start_row + 1; auto const cols_in_block = block.end_col - block.start_col + 1; extern __shared__ int8_t shared_data[]; @@ -767,61 +769,58 @@ __global__ void copy_to_columns(const size_type num_rows, } } - __syncthreads(); - - // now handle validity. Each thread is responsible for 32 rows in a single column. + // now handle validity. Each thread is responsible for 32 rows in 8 columns. // to prevent indexing issues with a large number of threads, this is compressed // to a single loop like above. TODO: investigate using shared memory here auto const validity_batches_per_col = (num_rows + 31) / 32; - auto const validity_batches_total = validity_batches_per_col * num_columns; - if (debug_print) { - printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n", validity_batches_per_col, validity_batches_total, num_rows); + auto const validity_batches_total = std::max(1, validity_batches_per_col * (num_columns / 8)); + if (debug_print && threadIdx.x == 0 && blockIdx.x == 0) { + printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n%d blocks of %d threads\n", validity_batches_per_col, validity_batches_total, num_rows, gridDim.x, blockDim.x); } - for (int index = threadIdx.x; index < validity_batches_total; index += blockDim.x) { - // what column is this? 
- auto const col = index / validity_batches_per_col; + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < validity_batches_total; index += blockDim.x * gridDim.x) { + auto const start_col = (index * 8) / validity_batches_per_col; auto const batch = index % validity_batches_per_col; auto const starting_row = batch * 32; - auto const validity_offset = col_offsets[num_columns] + word_index(col); + auto const validity_offset = col_offsets[num_columns] + (start_col / 8); if (debug_print) { - printf("col: %d, batch: %d, starting_row: %d, validity_offset: %d\n", col, batch, starting_row, validity_offset); + printf("%d-%d: cols: %d-%d, word index: %d, batch: %d, starting_row: %d, +validity_offset: %d, index: %d, stride: %d\n", threadIdx.x, blockIdx.x, start_col, start_col + 7, (start_col / 8), batch, starting_row, validity_offset, index, blockDim.x * gridDim.x); } - int32_t dst_validity = 0; + // one for each column + int32_t dst_validity[8] = {0}; for (int row = starting_row; row < std::min(num_rows, starting_row + 32); ++row) { int8_t const * const validity_ptr = &input_data[offsets[row] + validity_offset]; if (debug_print) { - printf("validity_ptr is %p for row %d\nwhich is input_data[%d]\n", validity_ptr, row, offsets[row] + validity_offset); + printf("%d: validity_ptr is %p for row %d\n", threadIdx.x, validity_ptr, row); } auto const val_byte = *validity_ptr; - auto const src_shift = intra_word_index(col); - auto const dst_shift = row % 32; - auto const src_bit_mask = 1 << src_shift; - if (debug_print) { - printf("src bit mask is 0x%x\n", src_bit_mask); - printf("src shift is 0x%x and dst shift is 0x%x\n", src_shift, dst_shift); - printf("validity bit is 0x%x\n", (val_byte & src_bit_mask) >> src_shift); - } -// auto const dst_bit_mask = 1 << dst_shift; - dst_validity |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); - if (debug_print) { - printf("validity is now 0x%x\n", dst_validity); + + for (int i=0; i> src_shift); + } + // auto const dst_bit_mask = 1 << dst_shift; + dst_validity[i] |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); } } - int32_t *validity_ptr = reinterpret_cast(output_nm[col] + (starting_row / 32)); - if (debug_print) { - printf("valiidty_ptr is output_nm[%d]: %p + starting_row / 8: %d because starting row is %d, which becomes %p\n", col, output_nm[col], starting_row / 32, starting_row, output_nm[col] + (starting_row / 32)); - printf("validity to write is %d\n", dst_validity); - printf("validity write %p <- %d\n", validity_ptr, dst_validity); + for (int i=0; i(output_nm[start_col + i] + (starting_row / 32)); + if (debug_print) { + printf("%d-%d: validity write output_nm[%d][%d] - %p <- %d\n", threadIdx.x, blockIdx.x, start_col + i, starting_row, validity_ptr, dst_validity[i]); + } + *validity_ptr = dst_validity[i]; } - *validity_ptr = dst_validity; } } +} /** * Calculate the dimensions of the kernel for fixed width only columns. @@ -980,7 +979,7 @@ static inline int32_t compute_fixed_width_layout(std::vector co // Now we need to add in space for validity // Eventually we can think about nullable vs not nullable, but for now we will just always add it // in - int32_t validity_bytes_needed = word_index(schema.size() + 7); + int32_t validity_bytes_needed = (schema.size() + 7) / 8; // validity comes at the end and is byte aligned so we can pack more in. 
at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned @@ -1300,7 +1299,7 @@ std::vector> convert_to_rows2(cudf::table_view con // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. - auto validity_size = num_bitmask_words(num_columns); + auto validity_size = num_bitmask_words(num_columns) * 4; for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned @@ -1521,6 +1520,8 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i CUDA_TRY( cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + shmem_limit_per_block /= NUM_BLOCKS_PER_KERNEL_TO_COLUMNS; + std::vector column_starts; std::vector column_sizes; @@ -1530,7 +1531,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i size_type fixed_width_size_per_row = detail::compute_column_information( iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); - size_type validity_size = num_bitmask_words(num_columns); + size_type validity_size = num_bitmask_words(num_columns) * 4; size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); @@ -1567,7 +1568,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - dim3 blocks(block_infos.size()); + dim3 blocks((block_infos.size() + (NUM_BLOCKS_PER_KERNEL_TO_COLUMNS - 1)) / NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); #if defined(DEBUG) || 1 dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)child.size())); #else @@ -1581,6 +1582,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i detail::copy_to_columns<<>>( num_rows, num_columns, + shmem_limit_per_block, input.offsets().data(), dev_output_data.data(), dev_output_nm.data(), From 2b069caf7e7d34077e37d7a1fdb92439472527fc Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Mon, 13 Sep 2021 19:46:03 +0000 Subject: [PATCH 49/80] updating for memcpy_async and validation in a different kernel --- .../row_conversion/row_conversion.cpp | 47 +- cpp/include/cudf/row_conversion.hpp | 38 +- cpp/src/row_conversion/row_conversion.cu | 1926 ++++++++++++----- cpp/tests/row_conversion/row_conversion.cpp | 132 +- java/src/main/native/src/row_conversion.cu | 1293 ++++++++++- java/src/main/native/src/row_conversion.hpp | 12 + 6 files changed, 2714 insertions(+), 734 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index 7c1f52c5cd6..ad9925e9043 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -20,7 +20,8 @@ #include #include -#include "cudf_test/column_utilities.hpp" +#include +#include class RowConversion : public cudf::benchmark { }; @@ -39,9 +40,6 @@ static void BM_old_to_row(benchmark::State& state) cudf::type_id::UINT64}, 212, row_count{n_rows}); - /* auto const table = create_random_table({cudf::type_id::INT32}, - 64, - row_count{n_rows});*/ cudf::size_type total_bytes = 0; for (int i = 0; i < table->num_columns(); ++i) { @@ -52,7 +50,7 @@ static void BM_old_to_row(benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); - auto rows = cudf::convert_to_rows(table->view()); + auto rows = 
cudf::old_convert_to_rows(table->view()); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); @@ -72,9 +70,6 @@ static void BM_new_to_row(benchmark::State& state) cudf::type_id::UINT64}, 212, row_count{n_rows}); - /* auto const table = create_random_table({cudf::type_id::INT32}, - 64, - row_count{n_rows});*/ cudf::size_type total_bytes = 0; for (int i = 0; i < table->num_columns(); ++i) { @@ -85,7 +80,7 @@ static void BM_new_to_row(benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); - auto new_rows = cudf::convert_to_rows2(table->view()); + auto new_rows = cudf::convert_to_rows(table->view()); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); @@ -114,12 +109,13 @@ static void BM_old_from_row(benchmark::State& state) total_bytes += cudf::size_of(t); } - auto rows = cudf::convert_to_rows(table->view()); + auto rows = cudf::old_convert_to_rows(table->view()); + cudf::lists_column_view const first_list(rows.front()->view()); for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); - auto out = cudf::convert_from_rows(rows, schema); + auto out = cudf::old_convert_from_rows(first_list, schema); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); @@ -148,36 +144,37 @@ static void BM_new_from_row(benchmark::State& state) total_bytes += cudf::size_of(t); } - auto rows = cudf::convert_to_rows(table->view()); + auto rows = cudf::old_convert_to_rows(table->view()); + cudf::lists_column_view const first_list(rows.front()->view()); for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); - auto out = cudf::convert_from_rows2(rows, schema); + auto out = cudf::convert_from_rows(first_list, schema); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); } #define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) #define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); FROM_ROW_CONVERSION_BENCHMARK_DEFINE(old_from_row_conversion, BM_old_from_row) diff --git a/cpp/include/cudf/row_conversion.hpp b/cpp/include/cudf/row_conversion.hpp index 282ffa4b0cb..8f82d01b06c 100644 --- a/cpp/include/cudf/row_conversion.hpp +++ b/cpp/include/cudf/row_conversion.hpp @@ -24,40 +24,28 @@ namespace cudf { -std::vector> convert_to_rows( - cudf::table_view const &tbl, +std::vector> old_convert_to_rows( + cudf::table_view const& tbl, // TODO need 
something for validity rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -std::vector> convert_to_rows2( - cudf::table_view const &tbl, +std::vector> convert_to_rows( + cudf::table_view const& tbl, // TODO need something for validity rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -std::unique_ptr convert_from_rows( - cudf::lists_column_view const &input, - std::vector const &schema, +std::unique_ptr old_convert_from_rows( + cudf::lists_column_view const& input, + std::vector const& schema, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); std::unique_ptr convert_from_rows( - std::vector> const &input, - std::vector const &schema, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr convert_from_rows2( - cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr convert_from_rows2( - std::vector> const &input, - std::vector const &schema, + cudf::lists_column_view const& input, + std::vector const& schema, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace cudf diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 0879a1c50a5..42c40e0542d 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -18,26 +18,42 @@ #include #include #include +#include +#include + +#include + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +#include +#endif #include #include +#include +#include #include +#include #include #include +#include #include #include #include + #include +#include #include -#include -#include -#include "cudf/types.hpp" -#include "rmm/device_buffer.hpp" -#include "thrust/iterator/counting_iterator.h" -#include "thrust/iterator/transform_iterator.h" +#include +#include -#define NUM_BLOCKS_PER_KERNEL_TO_COLUMNS (2) +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +constexpr auto NUM_BLOCKS_PER_KERNEL_TO_COLUMNS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; +constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; +constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; +#endif using cudf::detail::make_device_uvector_async; namespace cudf { @@ -52,11 +68,11 @@ static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type row_size, - const cudf::size_type *input_offset_in_row, - const cudf::size_type *num_bytes, - int8_t **output_data, - cudf::bitmask_type **output_nm, - const int8_t *input_data) + const cudf::size_type* 
input_offset_in_row, + const cudf::size_type* num_bytes, + int8_t** output_data, + cudf::bitmask_type** output_nm, + const int8_t* input_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. @@ -81,15 +97,15 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, // Because we are copying fixed width only data and we stride the rows // this thread will always start copying from shared data in the same place - int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t *row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + int8_t* row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t* row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; row_group_index += row_group_stride) { // Step 1: Copy the data into shared memory // We know row_size is always aligned with and a multiple of int64_t; - int64_t *long_shared = reinterpret_cast(shared_data); - const int64_t *long_input = reinterpret_cast(input_data); + int64_t* long_shared = reinterpret_cast(shared_data); + const int64_t* long_input = reinterpret_cast(input_data); cudf::size_type shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); cudf::size_type shared_output_stride = blockDim.x * blockDim.y; @@ -125,26 +141,26 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, for (cudf::size_type col_index = col_index_start; col_index < num_columns; col_index += col_index_stride) { cudf::size_type col_size = num_bytes[col_index]; - const int8_t *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); - int8_t *col_output = output_data[col_index]; + const int8_t* col_tmp = &(row_tmp[input_offset_in_row[col_index]]); + int8_t* col_output = output_data[col_index]; switch (col_size) { case 1: { col_output[row_index] = *col_tmp; break; } case 2: { - int16_t *short_col_output = reinterpret_cast(col_output); - short_col_output[row_index] = *reinterpret_cast(col_tmp); + int16_t* short_col_output = reinterpret_cast(col_output); + short_col_output[row_index] = *reinterpret_cast(col_tmp); break; } case 4: { - int32_t *int_col_output = reinterpret_cast(col_output); - int_col_output[row_index] = *reinterpret_cast(col_tmp); + int32_t* int_col_output = reinterpret_cast(col_output); + int_col_output[row_index] = *reinterpret_cast(col_tmp); break; } case 8: { - int64_t *long_col_output = reinterpret_cast(col_output); - long_col_output[row_index] = *reinterpret_cast(col_tmp); + int64_t* long_col_output = reinterpret_cast(col_output); + long_col_output[row_index] = *reinterpret_cast(col_tmp); break; } default: { @@ -157,12 +173,12 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, } } - cudf::bitmask_type *nm = output_nm[col_index]; - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::bitmask_type* nm = output_nm[col_index]; + int8_t* valid_byte = &row_vld_tmp[col_index / 8]; cudf::size_type byte_bit_offset = col_index % 8; int predicate = *valid_byte & (1 << byte_bit_offset); uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { nm[row_index / 8] = bitmask; } + if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } } // end column loop } // end row copy // wait for the row_group to be totally copied before starting on the next row group @@ -174,11 +190,11 @@ __global__ void 
copy_from_fixed_width_columns(const cudf::size_type start_row, const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type row_size, - const cudf::size_type *output_offset_in_row, - const cudf::size_type *num_bytes, - const int8_t **input_data, - const cudf::bitmask_type **input_nm, - int8_t *output_data) + const cudf::size_type* output_offset_in_row, + const cudf::size_type* num_bytes, + const int8_t** input_data, + const cudf::bitmask_type** input_nm, + int8_t* output_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. @@ -205,8 +221,8 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, // Because we are copying fixed width only data and we stride the rows // this thread will always start copying to shared data in the same place - int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t *row_vld_tmp = + int8_t* row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t* row_vld_tmp = &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; @@ -223,26 +239,26 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, for (cudf::size_type col_index = col_index_start; col_index < num_columns; col_index += col_index_stride) { cudf::size_type col_size = num_bytes[col_index]; - int8_t *col_tmp = &(row_tmp[output_offset_in_row[col_index]]); - const int8_t *col_input = input_data[col_index]; + int8_t* col_tmp = &(row_tmp[output_offset_in_row[col_index]]); + const int8_t* col_input = input_data[col_index]; switch (col_size) { case 1: { *col_tmp = col_input[row_index]; break; } case 2: { - const int16_t *short_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = short_col_input[row_index]; + const int16_t* short_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = short_col_input[row_index]; break; } case 4: { - const int32_t *int_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = int_col_input[row_index]; + const int32_t* int_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = int_col_input[row_index]; break; } case 8: { - const int64_t *long_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = long_col_input[row_index]; + const int64_t* long_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = long_col_input[row_index]; break; } default: { @@ -256,10 +272,10 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, } // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + int8_t* valid_byte = &row_vld_tmp[col_index / 8]; cudf::size_type byte_bit_offset = col_index % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; - int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); + int32_t* valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); // Now copy validity for the column if (input_nm[col_index]) { @@ -279,8 +295,8 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, // Step 2: Copy the data back out // We know row_size is always aligned with and a multiple of int64_t; - int64_t *long_shared = reinterpret_cast(shared_data); - 
int64_t *long_output = reinterpret_cast(output_data); + int64_t* long_shared = reinterpret_cast(shared_data); + int64_t* long_output = reinterpret_cast(output_data); cudf::size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); cudf::size_type shared_input_stride = blockDim.x * blockDim.y; @@ -303,12 +319,35 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, } } +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + struct block_info { int start_col; int start_row; int end_col; int end_row; int buffer_num; + + __host__ __device__ size_type get_row_size(size_type const* const col_offsets, + size_type const* const col_sizes, + bool debug_print = false) const + { + if (debug_print) + printf("col_offsets[%d]: %p + col_sizes[%d]: %p - col_offsets[%d]: %p\n%d + %d - %d\n", + end_col, + &col_offsets[end_col], + end_col, + &col_sizes[end_col], + start_col, + &col_offsets[start_col], + col_offsets[end_col], + col_sizes[end_col], + col_offsets[start_col]); + return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); + } + __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } + + __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } }; // When building the columns to return, we have to be mindful of the offset limit in cudf. @@ -341,13 +380,14 @@ struct row_batch { */ __global__ void copy_from_columns(const size_type num_rows, const size_type num_columns, - const int8_t **input_data, - const bitmask_type **input_nm, - const size_type *col_sizes, - const size_type *col_offsets, - const block_info *block_infos, - const size_type *row_offsets, - int8_t **output_data) + const size_type shmem_used_per_block, + const size_type num_block_infos, + const int8_t** input_data, + const size_type* col_sizes, + const size_type* col_offsets, + const block_info* block_infos, + const size_type* row_offsets, + int8_t** output_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. @@ -357,239 +397,597 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. 
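
The rewritten kernel below overlaps its two passes with cuda::memcpy_async and per-stage cuda::barrier objects (compute capability 7.0 and newer), fetching the next block into one shared-memory stage while the previous stage is written out. A condensed sketch of that double-buffered shape, using a hypothetical flat tile layout in place of the real block_info addressing; launch it with 2 * tile_bytes of dynamic shared memory.

#include <cooperative_groups.h>
#include <cuda/barrier>

#include <cstdint>

constexpr int STAGES = 2;  // mirrors NUM_BLOCKS_PER_KERNEL_LOADED

__global__ void double_buffered_copy(int8_t const* src, int8_t* dst, int tile_bytes, int num_tiles)
{
  extern __shared__ int8_t smem[];
  int8_t* stage[STAGES] = {smem, smem + tile_bytes};

  __shared__ cuda::barrier<cuda::thread_scope_block> stage_barrier[STAGES];
  auto group = cooperative_groups::this_thread_block();
  if (group.thread_rank() == 0) {
    for (int i = 0; i < STAGES; ++i) { init(&stage_barrier[i], group.size()); }
  }
  group.sync();

  for (int fetch = 0, drain = 0; drain < num_tiles; ++drain) {
    // keep up to STAGES async fetches in flight ahead of the tile being drained
    for (; fetch < num_tiles && fetch < drain + STAGES; ++fetch) {
      auto& fetch_barrier = stage_barrier[fetch % STAGES];
      // before reusing a stage, wait until every thread has finished draining it
      if (fetch >= STAGES) { fetch_barrier.arrive_and_wait(); }
      for (int i = threadIdx.x; i < tile_bytes; i += blockDim.x) {
        cuda::memcpy_async(
          &stage[fetch % STAGES][i], &src[fetch * tile_bytes + i], 1, fetch_barrier);
      }
    }

    auto& drain_barrier = stage_barrier[drain % STAGES];
    drain_barrier.arrive_and_wait();  // all async copies for this tile landed in shared memory
    for (int i = threadIdx.x; i < tile_bytes; i += blockDim.x) {
      dst[drain * tile_bytes + i] = stage[drain % STAGES][i];
    }
  }
}
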
- bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; + constexpr bool debug_print = false; // blockIdx.x == 0 && threadIdx.x == 1; + + constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; + auto group = cooperative_groups::this_thread_block(); + extern __shared__ int8_t shared_data[]; + int8_t* shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; + + __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&block_barrier[i], group.size()); + } + } + + group.sync(); if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("Column Info:\n"); - for (int i = 0; i < num_columns; ++i) { - printf("col %d is at %p with size %d and offset %d\n", - i, - input_data[i], - col_sizes[i], - col_offsets[i]); - } + printf("col sizes at %p, col offsets at %p, and row offsets at %p\n", + col_sizes, + col_offsets, + row_offsets); printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); - /* printf("Row Offsets:\n"); - for (int i=0; i NUM_BLOCKS_PER_KERNEL_LOADED) { fetch_barrier.arrive_and_wait(); } + + // to do the copy we need to do n column copies followed by m element copies OR + // we have to do m element copies followed by r row copies. When going from column + // to row it is much easier to copy by elements first otherwise we would need a running + // total of the column sizes for our block, which isn't readily available. This makes it more + // appealing to copy element-wise from input data into shared matching the end layout and do + // row-based memcopies out. + + for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { + auto const relative_col = el / num_fetch_rows; + auto const relative_row = el % num_fetch_rows; + auto const absolute_col = relative_col + fetch_block.start_col; + auto const absolute_row = relative_row + fetch_block.start_row; + if (debug_print) + printf("row %d(%d), col %d(%d), %d fetch rows, element %d\n", + relative_row, + absolute_row, + relative_col, + absolute_col, + num_fetch_rows, + el); + auto const col_size = col_sizes[absolute_col]; + auto const col_offset = col_offsets[absolute_col]; + auto const relative_col_offset = col_offset - starting_column_offset; + + auto const shared_offset = relative_row * fetch_block_row_size + relative_col_offset; + auto const input_src = input_data[absolute_col] + col_size * absolute_row; + + if (debug_print) + printf("block %lu to shared chunk %lu. 
%p <- %p - %d bytes\n", + fetch, + fetch % stages_count, + &shared[fetch % stages_count][shared_offset], + input_src, + col_size); + + // copy the main + cuda::memcpy_async( + &shared[fetch % stages_count][shared_offset], input_src, col_size, fetch_barrier); + } + } + + auto& subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; + subset_barrier.arrive_and_wait(); + + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; + if (debug_print) + printf("reading block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset); + + /* auto const rows_in_block = block.num_rows(); + auto const cols_in_block = block.num_cols();*/ + auto const block_row_size = block.get_row_size(col_offsets, col_sizes); + auto const column_offset = col_offsets[block.start_col]; + + // copy entire rows to final dest + for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; + absolute_row += blockDim.x) { + auto const relative_row = absolute_row - block.start_row; + auto const output_dest = + output_data[block.buffer_num] + absolute_row * block_row_size + column_offset; + if (debug_print) + printf("processing row %d\noutput data[%d] is address %p\n", + absolute_row, + absolute_row, + output_dest); + auto const shared_offset = block_row_size * relative_row; + if (debug_print) + printf("memcpy %p <- %p - %d bytes which is row %d\n", + output_dest, + &shared[subset % stages_count][shared_offset], + block_row_size, + absolute_row); + cuda::memcpy_async( + output_dest, &shared[subset % stages_count][shared_offset], block_row_size, subset_barrier); + } + } + + // wait on the last copies to complete + for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { + block_barrier[i].arrive_and_wait(); + } +} + +/** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block + * @param offsets + * @param output_data pointer to output data, partitioned by data size + * @param validity_offsets offset into input data row for validity data + * @param block_infos information about the blocks of work + * @param num_block_infos number of infos in blocks array + * @param input_data pointer to input data + * + */ +__global__ void copy_validity_from_columns(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_block, + const size_type* row_offsets, + int8_t** output_data, + const size_type validity_offset, + const block_info* block_infos, + const size_type num_block_infos, + const bitmask_type** input_nm) +{ extern __shared__ int8_t shared_data[]; - uint64_t const output_start_offset = col_offsets[block.start_col] + row_offsets[block.start_row]; - uint8_t const dest_shim_offset = - reinterpret_cast(&output_data[0][output_start_offset]) & - 7; // offset for alignment shim in order to match shared memory with final dest - if (debug_print) { - printf("outputting to offset %lu\n", output_start_offset); - printf("dest shim offset is %d\n", dest_shim_offset); - printf("Shared data is %p-%p\n", shared_data, shared_data + (48 * 1024)); - printf("my block is %d,%d -> %d,%d - buffer %d\n", - block.start_col, - block.start_row, - block.end_col, - block.end_row, - block.buffer_num); + int8_t* shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { + shared_data, shared_data + shmem_used_per_block / 2}; + + constexpr bool print_debug 
= false; //(threadIdx.x==0 || threadIdx.x == 32) && blockIdx.x == 0; + // if (blockIdx.x != 3 || threadIdx.x / 32 != 0) return; + if (print_debug) { + printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); + printf("%d %d - block infos are at %p and my index is %d\n", + threadIdx.x, + blockIdx.x, + block_infos, + blockIdx.x); + printf("%d %d - input nm is %p, input_nm[0] is at %p\n", + threadIdx.x, + blockIdx.x, + input_nm, + input_nm[0]); + printf("shared memory is %p to %p\n", shared_data, shared_data + shmem_used_per_block * 2); + printf("block infos at %p and this is index %d\n", + &block_infos, + blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + 0); + /* printf("Row Offsets:\n"); + for (int i=0; i + shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&shared_block_barriers[i], group.size()); + } } - for (int col = block.start_col; col <= block.end_col; ++col) { - /*if (!col_is_variable) */ { - uint64_t col_offset = 0; - cudf::size_type col_size = col_sizes[col]; - auto const dest_col_offset = - col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; - if (debug_print) { printf("dest col offset %d\n", dest_col_offset); } - for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { - if (debug_print) { - printf("shmem row %d(%d) at offset %d(%d)\n", - row - block.start_row, - row, - (row - block.start_row) * shmem_row_size, - row * shmem_row_size); - } - int8_t *shmem_dest = - &shared_data[dest_col_offset + shmem_row_size * (row - block.start_row)]; - switch (col_size) { - case 1: { - if (debug_print) { printf("%p <- byte %d\n", shmem_dest, input_data[col][row]); } - *shmem_dest = input_data[col][row]; - break; - } - case 2: { - const int16_t *short_col_input = reinterpret_cast(input_data[col]); - if (debug_print) { printf("%p <- short %d\n", shmem_dest, short_col_input[row]); } - *reinterpret_cast(shmem_dest) = short_col_input[row]; - break; - } - case 4: { - const int32_t *int_col_input = reinterpret_cast(input_data[col]); - if (debug_print) { - printf("shmem[%d][%d] - %p <- int 0x%x\n", row, col, shmem_dest, int_col_input[row]); - } - *reinterpret_cast(shmem_dest) = int_col_input[row]; - break; - } - case 8: { - const int64_t *long_col_input = reinterpret_cast(input_data[col]); - if (debug_print) { printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); } - *reinterpret_cast(shmem_dest) = long_col_input[row]; - break; - } - default: { - cudf::size_type input_offset = col_size * row; - if (debug_print) { - printf("byte for byte copy due to size %d of column %d\n", col_size, col); - printf("%p <- input_data[%d] which is %d\n", - shmem_dest, - input_offset, - input_data[col][input_offset]); - } - // TODO this should just not be supported for fixed width columns, but just in case... 
- for (cudf::size_type b = 0; b < col_size; b++) { - shmem_dest[b] = input_data[col][b + input_offset]; + + group.sync(); + + for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { + if (validity_block != validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { + if (print_debug) + printf("%d: waiting at barrier %d\n", + threadIdx.x, + validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED); + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] + .arrive_and_wait(); + if (print_debug) printf("past barrier...\n"); + } + int8_t* this_shared_block = shared_blocks[validity_block % 2]; + if (print_debug) printf("top of loop for validity block %d\n", validity_block); + if (print_debug) + printf("reading validity block info %d at %p\n", + blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block, + &block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]); + auto block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; + + auto const num_block_cols = block.num_cols(); + auto const num_block_rows = block.num_rows(); + + auto const num_sections_x = (num_block_cols + 31) / 32; + auto const num_sections_y = (num_block_rows + 7) / 8; + auto const validity_data_row_length = + align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); + auto const total_sections = num_sections_x * num_sections_y; + + if (print_debug) { + printf("%d %d - block %d has %d cols, %d rows, %d row length, and %d total sections\n", + threadIdx.x, + blockIdx.x, + blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block, + num_block_cols, + num_block_rows, + validity_data_row_length, + total_sections); + } + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + + if (print_debug) + printf( + "%d %d - my warp is %d, %d total sections, %d warps per block, blockDim.x=%d, warp side " + "%d\n", + threadIdx.x, + blockIdx.x, + warp_id, + total_sections, + warps_per_block, + blockDim.x, + detail::warp_size); + // the block is divided into sections. A warp operates on a section at a time. 
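// A minimal sketch of the section bookkeeping described above: a validity block is
// tiled into 32-column by 8-row sections, one warp per section, with each lane of
// the warp owning one column of its section. The flat-index-to-(x, y) convention
// below is one reasonable choice and is only illustrative.
#include <cassert>

namespace sketch {

struct section_geometry {
  int num_sections_x;  // sections across the columns (32 columns each)
  int num_sections_y;  // sections down the rows (8 rows each)
};

constexpr section_geometry make_sections(int num_block_cols, int num_block_rows)
{
  return {(num_block_cols + 31) / 32, (num_block_rows + 7) / 8};
}

inline void section_example()
{
  auto const geo = make_sections(/*cols=*/40, /*rows=*/20);
  assert(geo.num_sections_x == 2 && geo.num_sections_y == 3);

  // flat section index -> 2D section coordinate, then lane 5 of that warp
  int const my_section_idx = 3;
  int const section_x      = my_section_idx % geo.num_sections_x;  // 1
  int const section_y      = my_section_idx / geo.num_sections_x;  // 1
  int const lane_id        = 5;
  int const relative_col   = section_x * 32 + lane_id;  // column 37 of the block
  int const relative_row   = section_y * 8;             // rows 8..15 of the block
  assert(relative_col == 37 && relative_row == 8);
}

}  // namespace sketch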
+ for (int my_section_idx = warp_id; my_section_idx < total_sections; + my_section_idx += warps_per_block) { + // convert to rows and cols + auto const section_x = my_section_idx / num_sections_x; + auto const section_y = my_section_idx % num_sections_x; + + if (print_debug) printf("working on section %d of %d...\n", section_x, num_sections_x); + auto const relative_col = section_x * 32 + lane_id; + auto const relative_row = section_y * 8; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + auto const cols_left = num_columns - absolute_col; + + if (print_debug) printf("pre ballot sync...\n"); + auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); + + if (print_debug) + printf( + "participation mask is 0x%x for relative row %d(%d real), relative col %d(%d absolute)\n", + participation_mask, + relative_row, + absolute_row, + relative_col, + absolute_col); + + if (absolute_col < num_columns) { + if (print_debug) + printf( + "thread %d's byte is at %p, participation mask is 0x%x for relative row %d(%d real), " + "relative col %d(%d absolute)\n", + threadIdx.x, + &input_nm[absolute_col][absolute_row / 32], + participation_mask, + relative_row, + absolute_row, + relative_col, + absolute_col); + auto my_byte = + input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] : 0xFF; + + if (print_debug) + printf( + "thread %d's byte is 0x%x, participation mask is 0x%x for relative row %d(%d real), " + "relative col %d(%d absolute)\n", + threadIdx.x, + my_byte & 0xFF, + participation_mask, + relative_row, + absolute_row, + relative_col, + absolute_col); + + // every thread that is participating in the warp has a byte, but it's column-based + // data and we need it in row-based. So we shiffle the bits around with ballot_sync to make + // the bytes we actually write. 
+ for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { + auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); + // lead thread in each warp writes data + auto const validity_write_offset = + validity_data_row_length * (relative_row + i) + relative_col / 8; + if (threadIdx.x % detail::warp_size == 0) { + if (print_debug) + printf( + "%d %d - byte_mask is 0x%x, masked_byte is 0x%x, shared_data_block[%d][%d] = " + "0x%x\n", + threadIdx.x, + blockIdx.x, + byte_mask, + my_byte & byte_mask, + validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED, + validity_write_offset, + validity_data); + if (cols_left <= 8) { + // write byte + if (print_debug) + printf("writing single byte to shared offset 0x%x which is %p...\n", + validity_write_offset, + &this_shared_block[validity_write_offset]); + this_shared_block[validity_write_offset] = validity_data & 0xFF; + } else if (cols_left <= 16) { + // write int16 + if (print_debug) + printf("writing two bytes to shared offset 0x%x which is %p...\n", + validity_write_offset, + &this_shared_block[validity_write_offset]); + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + } else if (cols_left <= 24) { + // write int16 and then int8 + if (print_debug) + printf("writing three bytes to shared offset 0x%x which is %p...\n", + validity_write_offset, + &this_shared_block[validity_write_offset]); + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; + } else { + // write int32 + if (print_debug) + printf("writing 4 bytes to shared offset 0x%x which is %p...\n", + validity_write_offset, + &this_shared_block[validity_write_offset]); + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data; } - break; } } + } + } - // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned - // so we have to rewrite the addresses to make sure that it is 4 byte aligned - // we do this directly in the final location because the entire row may not - // fit in shared memory and may require many blocks to process it entirely - int8_t *valid_byte = - &output_data[block.buffer_num][row_offsets[row] + validity_offset + (col / 8)]; - cudf::size_type byte_bit_offset = col % 8; - uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; - int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); - cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); - if (debug_print) { printf("Outputting validity to %p\n", valid_byte); } - // Now copy validity for the column - if (input_nm[col]) { - if (bit_is_set(input_nm[col], row)) { - atomicOr_block(valid_int, 1 << int_bit_offset); - } else { - atomicAnd_block(valid_int, ~(1 << int_bit_offset)); - } - } else { - // It is valid so just set the bit - atomicOr_block(valid_int, 1 << int_bit_offset); - } - } // end row + // make sure entire block has finished copy + group.sync(); - col_offset += col_sizes[col] * rows_in_block; + // now async memcpy the shared memory out to the final destination + for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { + auto const relative_row = row - block.start_row; + if (print_debug) { + printf( + "base output data is %p, row offset is 0x%x, validity offset into row is 0x%x, word " + "index of block start is 0x%x\n", + output_data[block.buffer_num], + row_offsets[row], + validity_offset, + 
word_index(block.start_col)); + printf( + "%d %d - row %d/%d/%d col %d-%d - %p = shared_data_block[%d][%d] which is %p - %d " + "bytes\n - %p <- 0x%x\n", + threadIdx.x, + blockIdx.x, + block.start_row, + row, + block.end_row, + block.start_col, + block.end_col, + output_data[block.buffer_num] + row_offsets[row] + validity_offset + + (word_index(block.start_col)), + validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED, + validity_data_row_length * relative_row, + &this_shared_block[validity_data_row_length * relative_row], + util::div_rounding_up_unsafe(num_block_cols, 8), + output_data[block.buffer_num] + row_offsets[row] + validity_offset + + word_index(block.start_col), + this_shared_block[validity_data_row_length * relative_row]); + } + auto const output_ptr = + output_data[block.buffer_num] + row_offsets[row] + validity_offset + block.start_col / 8; + auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); + cuda::memcpy_async( + output_ptr, + &this_shared_block[validity_data_row_length * relative_row], + num_bytes, + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); + + /* auto const padding_ptr = output_ptr + num_bytes; + auto const padding_needed = -reinterpret_cast(padding_ptr) & 7; + if (print_debug) printf( + "absolute_row: %d, row_offset for this row: 0x%x, validity data bytes: %d, end + address: %p, padding bytes %lu\n", row, row_offsets[row], num_bytes, output_ptr + + num_bytes, padding_needed); cuda::memcpy_async(padding_ptr, zero, padding_needed, + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); + */ + + /* if (print_debug) { + for (int i=0; i %p\n", - num_single_bytes, - &input_ptr[i + dest_shim_offset], - input_ptr[i + dest_shim_offset], - &output_ptr[i]); - } - output_ptr[i] = input_ptr[i + dest_shim_offset]; - } - } else if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { - // first byte with leading pad - auto const num_single_bytes = 8 - dest_shim_offset; - for (auto i = 0; i < num_single_bytes; ++i) { - if (debug_print) { - printf( - "single byte final write %p -> %p\n", &input_ptr[i + dest_shim_offset], &output_ptr[i]); - } - output_ptr[i] = input_ptr[i + dest_shim_offset]; - } - } else if ((src_offset + 8) % shmem_row_size == 0 && - (real_bytes_in_row + dest_shim_offset) % 8 > 0) { - // last bytes of a row - auto const num_single_bytes = (real_bytes_in_row + dest_shim_offset) % 8; - for (auto i = 0; i < num_single_bytes; ++i) { - if (debug_print) { - printf("single trailing byte final write %p -> %p\n", - &input_ptr[i + dest_shim_offset], - &output_ptr[i]); - } - output_ptr[i] = input_ptr[i + dest_shim_offset]; - } - } else { - // copy 8 bytes aligned - const int64_t *long_col_input = reinterpret_cast(input_ptr); - if (debug_print) { - printf( - "long final write %p -> %p\n", long_col_input, reinterpret_cast(output_ptr)); +} + +static __device__ std::tuple get_admin_data_sizes(size_t col_size_size, + size_t col_offset_size, + int const num_cols) +{ + auto const col_size_bytes = num_cols * col_size_size; + auto const col_offset_bytes = num_cols * col_offset_size; + + return {col_size_bytes, col_offset_bytes}; +} + +/** + * @brief ensure `read_ahead` buffer blocks are fetched + * + * @param fetch_index internal state passed into the function + * @param processing_index index where processing is occuring + * @param read_ahead_count how many blocks to read ahead + * @param max_resident_blocks how many blocks can be loaded at once + * @param total_blocks total number of blocks 
overall + * @param block_infos pointer to the block infos + * @param col_sizes pointer to column size information + * @param col_offsets pointer to the table's column offsets + * @param row_offsets pointer to offsets for each row in the table + * @param input_data pointer to the input data + * @param shared pointer to shared memory + * @param group thread group participating in the fetch + * @param block_barrier barriers used for each block + * @param debug_print + * @return + */ +static __device__ void fetch_blocks_for_row_to_column( + size_t& fetch_index, + size_t const processing_index, + int const read_ahead_count, + int const max_resident_blocks, + int const total_blocks, + block_info const* const block_infos, + size_type const* const col_sizes, + size_type const* const col_offsets, + size_type const* const row_offsets, + int8_t const* const input_data, + int8_t* shared[], + cooperative_groups::thread_block const group, + cuda::barrier* block_barrier, + bool debug_print) +{ + for (; fetch_index < static_cast(total_blocks) && + fetch_index < (processing_index + read_ahead_count); + ++fetch_index) { + if (debug_print) + printf("fetching block %lu of %d\n", + blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index, + total_blocks); + auto const fetch_block = + block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index]; + auto const fetch_block_start_row = fetch_block.start_row; + auto const fetch_block_end_row = fetch_block.end_row; + auto const starting_col_offset = col_offsets[fetch_block.start_col]; + + auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); + auto const num_fetch_cols = fetch_block.num_cols(); + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( + sizeof(decltype(*col_sizes)), sizeof(decltype(*col_offsets)), num_fetch_cols); + auto& fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; + + // if we have fetched all buffers, we need to wait for processing + // to complete on them before we can use them again + if (fetch_index > NUM_BLOCKS_PER_KERNEL_LOADED) { fetch_barrier.arrive_and_wait(); } + + auto shared_row_offset = 0; + // copy the data for column sizes + if (debug_print) + printf("%d: col sizes memcpy_async(group, %p, %p, %d, barrier);\n", + threadIdx.x, + &shared[fetch_index % max_resident_blocks][shared_row_offset], + &col_offsets[fetch_block.start_col], + col_size_bytes); + if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) + printf("%d-%d fetching to %p with barrier %p\n", + threadIdx.x, + blockIdx.x, + shared[fetch_index % max_resident_blocks], + &fetch_barrier); + cuda::memcpy_async(group, + &shared[fetch_index % max_resident_blocks][shared_row_offset], + &col_sizes[fetch_block.start_col], + col_size_bytes, + fetch_barrier); + shared_row_offset += col_size_bytes; + // copy the data for column offsets + if (debug_print) + printf("%d: offsets memcpy_async(group, %p, %p, %d, barrier);\n", + threadIdx.x, + &shared[fetch_index % max_resident_blocks][shared_row_offset], + &col_offsets[fetch_block.start_col], + col_offset_bytes); + cuda::memcpy_async(group, + &shared[fetch_index % max_resident_blocks][shared_row_offset], + &col_offsets[fetch_block.start_col], + col_offset_bytes, + fetch_barrier); + shared_row_offset += col_offset_bytes; + shared_row_offset = align_offset(shared_row_offset, 8); + + if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0 && fetch_block.start_col == 0 && + fetch_block.start_row <= 51 && fetch_block.end_row >= 51) { + printf("Input 
data for col 0 row 51 is 0x"); + for (int i = 0; i < col_sizes[0]; ++i) { + printf("%x ", input_data[row_offsets[51] + col_offsets[0] + i]); } - *reinterpret_cast(output_ptr) = *long_col_input; + printf("\n"); + printf( + "this is at offset %d-%d and starting column offset is %d and we're reading %d bytes\n", + col_offsets[0], + col_offsets[0] + col_sizes[0], + starting_col_offset, + fetch_block_row_size); + auto shared_offset = (51 - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; + printf("destination is %p", &shared[fetch_index % max_resident_blocks][shared_offset]); + } + + for (auto row = fetch_block_start_row + static_cast(threadIdx.x); + row <= fetch_block_end_row; + row += blockDim.x) { + auto shared_offset = (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; + if (debug_print) + printf("fetching block %lu to shared chunk %lu. %p <- %p\n", + fetch_index, + fetch_index % max_resident_blocks, + &shared[fetch_index % max_resident_blocks][shared_offset], + &input_data[row_offsets[row] + starting_col_offset]); + // copy the main + cuda::memcpy_async(&shared[fetch_index % max_resident_blocks][shared_offset], + &input_data[row_offsets[row] + starting_col_offset], + fetch_block_row_size, + fetch_barrier); } } } @@ -600,7 +998,7 @@ __global__ void copy_from_columns(const size_type num_rows, * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets + * @param row_offsets * @param output_data * @param output_nm * @param col_sizes array of sizes for each element in a column - one per column @@ -612,13 +1010,13 @@ __global__ void copy_from_columns(const size_type num_rows, __global__ void copy_to_columns(const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, - const size_type *offsets, - int8_t **output_data, - cudf::bitmask_type **output_nm, - const size_type *col_sizes, - const size_type *col_offsets, - const block_info *block_infos, - const int8_t *input_data) + const size_type* row_offsets, + int8_t** output_data, + const size_type* _col_sizes, + const size_type* _col_offsets, + const block_info* block_infos, + const size_type num_block_infos, + const int8_t* input_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. @@ -628,7 +1026,14 @@ __global__ void copy_to_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. 
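// A minimal sketch of the staging pattern used in these kernels: threads issue
// cuda::memcpy_async() calls bound to a block-scoped cuda::barrier, then the whole
// block waits on that barrier before touching the staged bytes. Like the kernels
// guarded above, this requires compute capability 7.0+. The kernel name, slice
// size, and the assumption that dynamic shared memory >= tile_bytes are illustrative.
#include <cooperative_groups.h>
#include <cuda/barrier>
#include <cstdint>

__global__ void stage_tile_sketch(int8_t const* global_in, int8_t* global_out, int tile_bytes)
{
  extern __shared__ int8_t staged[];
  auto block = cooperative_groups::this_thread_block();

  __shared__ cuda::barrier<cuda::thread_scope_block> ready;
  if (block.thread_rank() == 0) { init(&ready, block.size()); }
  block.sync();

  // each thread asynchronously stages an 8-byte slice of the tile
  for (int i = static_cast<int>(block.thread_rank()) * 8; i < tile_bytes;
       i += static_cast<int>(block.size()) * 8) {
    int const bytes = min(8, tile_bytes - i);
    cuda::memcpy_async(&staged[i], &global_in[i], bytes, ready);
  }

  // the async copies are only guaranteed complete once the barrier flips
  ready.arrive_and_wait();

  // consume the staged data (here: trivially write it back out)
  for (int i = static_cast<int>(block.thread_rank()); i < tile_bytes;
       i += static_cast<int>(block.size())) {
    global_out[i] = staged[i];
  }
}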
- constexpr bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; + // to speed up some of the random access memory we do, we copy col_sizes and col_offsets + // to shared memory for each of the blocks that we work on + + /*constexpr*/ bool debug_print = false; // threadIdx.x == 0; + constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; + auto group = cooperative_groups::this_thread_block(); + extern __shared__ int8_t shared_data[]; + int8_t* shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -638,189 +1043,387 @@ __global__ void copy_to_columns(const size_type num_rows, printf("%d: %d\n", i, row_offsets[i]); }*/ printf("output data to %p\n", output_data[block_infos[blockIdx.x].buffer_num]); + printf("shared memory pointers are %p and %p\n", shared[0], shared[1]); + printf("shared_memory ends at %p\n", &shared_data[shmem_used_per_block * 2]); + printf("group is %d threads\n", group.size()); } -// else { return; } + // else { return; } - for (int block_offset = 0; block_offset < NUM_BLOCKS_PER_KERNEL_TO_COLUMNS; ++block_offset) { - auto this_block_index = blockIdx.x*NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + block_offset; - if (this_block_index > blockDim.x) { - break; + __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&block_barrier[i], group.size()); } - auto block = block_infos[this_block_index]; - auto const rows_in_block = block.end_row - block.start_row + 1; - auto const cols_in_block = block.end_col - block.start_col + 1; - extern __shared__ int8_t shared_data[]; + } - // copy data from our block's window to shared memory - // offsets information can get us on the row, then we need to know where the column - // starts to offset into the row data. - - // each thread is responsible for 8-byte chunks starting at threadIdx.x and striding - // at blockDim.x. If the 8-byte chunk falls on the boundary of the window, then the - // thread may copy less than 8 bytes. Even if at the beginning of the window, because - // every internal copy is aligned to 8-byte boundaries. - // - // thread 0 thread 1 thread 2 thread 3 thread 4 thread 5 - // 01234567 89abcdef 01234567 89abcdef 01234567 89abcdef - // xxxbbbbb bbbbbbbb bbbbbbbb bbbbbbbb bbbbbbbb bbxxxxxx - // | | | | | | | - // - // - - auto const window_start_quad = col_offsets[block.start_col] / 8; - auto const window_end_quad = (col_offsets[block.end_col] + col_sizes[block.end_col] + 7) / 8; - auto const window_quad_width = window_end_quad - window_start_quad; - auto const total_quads = window_quad_width * rows_in_block; - auto const shared_memory_starting_pad = col_offsets[block.start_col] & 0x7; + group.sync(); - if (debug_print) { - printf("col_offsets[%d]: %d, col_offsets[%d]: %d col_sizes[%d]: %d\n", block.start_col, col_offsets[block.start_col], block.end_col, col_offsets[block.end_col], block.end_col, col_sizes[block.end_col]); - printf("window start quad is %d, window end quad is %d\n", window_start_quad, window_end_quad); - printf("window quad width is %d and there are %d total quads\n%d shared memory starting pad\n", window_quad_width, total_quads, shared_memory_starting_pad); - } + auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS, + (uint)NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); - // the copy to shared memory will be greedy. 
We know that the data is 8-byte aligned, so we won't - // access illegal memory by doing 8-byte aligned copies, so we can copy 8-byte aligned. This will - // result in the window edges being duplicated across blocks, but we can copy the padding as well - // to speed up our transfers to shared memory. - for (int i = threadIdx.x; i < total_quads; i += blockDim.x) { - auto const relative_row = i / window_quad_width; - auto const absolute_row = relative_row + block.start_row; - //auto const row = i / window_quad_width; - auto const offset_in_row = i % window_quad_width * 8; - auto const shmem_dest = &shared_data[i * 8]; - - if (debug_print) { - printf("relative_row: %d, absolute_row: %d, offset_in_row: %d, shmem_dest: %p\n", relative_row, absolute_row, offset_in_row, shmem_dest); - printf("offsets is %p\n", offsets); - printf("offsets[%d]: %d\n", absolute_row, offsets[absolute_row]); - printf("input_data[%d] will be dereferenced\n", offsets[absolute_row] + offset_in_row); - } + auto get_admin_data_sizes = [col_size_size = sizeof(decltype(*_col_sizes)), + col_offset_size = sizeof(decltype(*_col_offsets))]( + int const num_cols, + int const num_rows) -> std::tuple { + auto const col_size_bytes = num_cols * col_size_size; + auto const col_offset_bytes = num_cols * col_offset_size; - // full 8-byte copy - const int64_t *long_col_input = - reinterpret_cast(&input_data[offsets[absolute_row] + offset_in_row]); - if (debug_print) { - printf("which will be address %p\n", long_col_input); - printf("%p <- long %lu\n", shmem_dest, *long_col_input); } - *reinterpret_cast(shmem_dest) = *long_col_input; - } + return {col_size_bytes, col_offset_bytes}; + }; - __syncthreads(); - - // now we copy from shared memory to final destination. - // the data is laid out in rows in shared memory, so the reads - // for a column will be "vertical". Because of this and the different - // sizes for each column, this portion is handled on row/column basis. - // to prevent each thread working on a single row and also to ensure - // that all threads can do work in the case of more threads than rows, - // we do a global index instead of a double for loop with col/row. 
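// A minimal sketch of the flattened indexing the comment above describes: one
// global index covers every (row, column) cell of the block, so any number of
// threads can stride over it without a nested row/column loop. Values are
// illustrative.
#include <cassert>

namespace sketch {

inline void flat_index_example()
{
  int const cols_in_block = 5;
  int const rows_in_block = 4;  // 20 cells total, covered by index 0..19

  int const index        = 13;
  int const relative_col = index % cols_in_block;  // 3
  int const relative_row = index / cols_in_block;  // 2
  assert(relative_col == 3 && relative_row == 2);
  assert(relative_row * cols_in_block + relative_col == index);
  assert(rows_in_block * cols_in_block == 20);
}

}  // namespace sketch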
- for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { - auto const relative_col = index % cols_in_block; - auto const relative_row = index / cols_in_block; - auto const absolute_col = relative_col + block.start_col; - auto const absolute_row = relative_row + block.start_row; - - auto const shared_memory_row_offset = window_quad_width * 8 * relative_row; - auto const shared_memory_offset = col_offsets[absolute_col] - col_offsets[block.start_col] + - shared_memory_row_offset + shared_memory_starting_pad; - auto const column_size = col_sizes[absolute_col]; - - int8_t *shmem_src = &shared_data[shared_memory_offset]; - int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; - - if (debug_print) { - printf("relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, shared_mmeory_row_offset: %d, shared_memory_offset: %d," - " column_size: %d, shmem_src: %p, dst: %p\n", relative_col, relative_row, absolute_col, absolute_row, shared_memory_row_offset, shared_memory_offset, column_size, - shmem_src, dst) ; - } - switch (column_size) { - case 1: { - if (debug_print) { printf("%p <- byte %d\n", dst, *shmem_src); } - *dst = *shmem_src; - break; - } - case 2: { - const int16_t *short_col_input = reinterpret_cast(shmem_src); - if (debug_print) { printf("%p <- short %d\n", dst, *short_col_input); } - *reinterpret_cast(dst) = *short_col_input; - break; - } - case 4: { - const int32_t *int_col_input = reinterpret_cast(shmem_src); - if (debug_print) { printf("%p <- int 0x%x\n", dst, *int_col_input); } - *reinterpret_cast(dst) = *int_col_input; - break; - } - case 8: { - const int64_t *long_col_input = reinterpret_cast(shmem_src); - if (debug_print) { printf("%p <- long %lu\n", dst, *long_col_input); } - *reinterpret_cast(dst) = *long_col_input; - break; + if (debug_print) + printf("%d blocks remaining -> %d block infos, %d block index\n", + blocks_remaining, + num_block_infos, + blockIdx.x); + size_t fetch; + size_t subset; + for (subset = fetch = 0; subset < blocks_remaining; ++subset) { + // Fetch ahead up to stages_count subsets + fetch_blocks_for_row_to_column(fetch, + subset, + stages_count, + stages_count, + blocks_remaining, + block_infos, + _col_sizes, + _col_offsets, + row_offsets, + input_data, + shared, + group, + block_barrier, + debug_print); + + auto& subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; + // ensure our data is ready + if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) + printf("%d-%d waiting at barrier %p\n", threadIdx.x, blockIdx.x, &subset_barrier); + subset_barrier.arrive_and_wait(); + + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; + if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) + printf("%d-%d reading block %lu at address %p\n", + threadIdx.x, + blockIdx.x, + blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset, + shared[subset % stages_count]); + + auto const rows_in_block = block.num_rows(); + auto const cols_in_block = block.num_cols(); + + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes(cols_in_block, rows_in_block); + // auto shared_row_offsets = shared[subset]; + auto shared_col_sizes = reinterpret_cast(shared[subset % stages_count]); + auto shared_col_offsets = + reinterpret_cast(&shared[subset % stages_count][col_size_bytes]); + + auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); + + auto block_row_size = block.get_row_size(_col_offsets, _col_sizes, debug_print); + + 
// now we copy from shared memory to final destination. + // the data is laid out in rows in shared memory, so the reads + // for a column will be "vertical". Because of this and the different + // sizes for each column, this portion is handled on row/column basis. + // to prevent each thread working on a single row and also to ensure + // that all threads can do work in the case of more threads than rows, + // we do a global index instead of a double for loop with col/row. + for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { + auto const relative_col = index % cols_in_block; + auto const relative_row = index / cols_in_block; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + + if (debug_print) + printf("copying for row %d(%d absolute) col %d(%d absolute)\n", + relative_row, + absolute_row, + relative_col, + absolute_col); + + auto const shared_memory_row_offset = block_row_size * relative_row; + if (debug_print) + printf("shared_col_offsets is %p and relative col is %d, making me access %p\n", + shared_col_offsets, + relative_col, + &shared_col_offsets[relative_col]); + auto const shared_memory_offset = shared_col_offsets[relative_col] - shared_col_offsets[0] + + shared_memory_row_offset + shared_row_offset; + if (debug_print) + printf("shared_col_sizes is %p and relative col is %d, making me access %p\n", + shared_col_sizes, + relative_col, + &shared_col_sizes[relative_col]); + auto const column_size = shared_col_sizes[relative_col]; + + int8_t* shmem_src = &shared[subset % stages_count][shared_memory_offset]; + int8_t* dst = &output_data[absolute_col][absolute_row * column_size]; + + if (debug_print) { + printf( + "relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, " + "shared_mmeory_row_offset: %d, shared_memory_offset: %d," + " column_size: %d, shmem_src: %p, dst: %p\n",//, uint32 is %u\n", + relative_col, + relative_row, + absolute_col, + absolute_row, + shared_memory_row_offset, + shared_memory_offset, + column_size, + shmem_src, + dst/*, + *reinterpret_cast(shmem_src)*/); + printf("memcpy_async(%p, %p, %d, subset_barrier);\n", dst, shmem_src, column_size); } - default: { - if (debug_print) { - printf("byte for byte copy due to size %d of column %d\n", column_size, absolute_col); + if (debug_print && absolute_col == 0 && absolute_row == 51) { + printf("col0row51(%d bytes) = %p - 0x", column_size, shmem_src); + for (int i = 0; i < column_size; ++i) { + printf("%x ", shmem_src[i]); } - // TODO this should just not be supported for fixed width columns, but just in case... 
- for (cudf::size_type b = 0; b < column_size; b++) { dst[b] = shmem_src[b]; } - break; + printf("\n"); } + + cuda::memcpy_async(dst, shmem_src, column_size, subset_barrier); } + group.sync(); + if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) + printf( + "%d-%d copy to main memory with barrier %p\n", threadIdx.x, blockIdx.x, &subset_barrier); + } + + // wait on the last copies to complete + for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { + block_barrier[i].arrive_and_wait(); } +} + +/** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block + * @param offsets + * @param output_nm + * @param validity_offsets offset into input data row for validity data + * @param block_infos information about the blocks of work + * @param num_block_infos number of infos in blocks array + * @param input_data pointer to input data + * + */ +__global__ void copy_validity_to_columns(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_block, + const size_type* row_offsets, + cudf::bitmask_type** output_nm, + const size_type validity_offset, + const block_info* block_infos, + const size_type num_block_infos, + const int8_t* input_data) +{ + extern __shared__ int8_t shared_data[]; + int8_t* shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { + shared_data, shared_data + shmem_used_per_block / 2}; - // now handle validity. Each thread is responsible for 32 rows in 8 columns. - // to prevent indexing issues with a large number of threads, this is compressed - // to a single loop like above. TODO: investigate using shared memory here - auto const validity_batches_per_col = (num_rows + 31) / 32; - auto const validity_batches_total = std::max(1, validity_batches_per_col * (num_columns / 8)); - if (debug_print && threadIdx.x == 0 && blockIdx.x == 0) { - printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n%d blocks of %d threads\n", validity_batches_per_col, validity_batches_total, num_rows, gridDim.x, blockDim.x); + bool print_debug = false; // threadIdx.x == 0 && blockIdx.x == 0; + // bool print_debug = false; + // if (blockIdx.x != 3 || threadIdx.x / 32 != 0) return; + if (print_debug) { + printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); + printf("%d %d - block infos are at %p and my index is %d\n", + threadIdx.x, + blockIdx.x, + block_infos, + blockIdx.x); + printf( + "%d %d - Shared memory starts at %p and ends at %p, input data is %p, output data is %p, row " + "offsets are %p, block infos at %p\n", + threadIdx.x, + blockIdx.x, + shared_data, + shared_data + shmem_used_per_block, + input_data, + output_nm, + row_offsets, + block_infos); + /* printf("Row Offsets:\n"); + for (int i=0; i + shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&shared_block_barriers[i], group.size()); } + } - // one for each column - int32_t dst_validity[8] = {0}; - for (int row = starting_row; row < std::min(num_rows, starting_row + 32); ++row) { - int8_t const * const validity_ptr = &input_data[offsets[row] + validity_offset]; + group.sync(); - if (debug_print) { - printf("%d: validity_ptr is %p for row %d\n", threadIdx.x, validity_ptr, row); - } - - auto const val_byte = 
*validity_ptr; - - for (int i=0; i> src_shift); + for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { + auto const validity_index = validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + if (validity_block != validity_index) { + shared_block_barriers[validity_index].arrive_and_wait(); + } + int8_t* this_shared_block = shared_blocks[validity_block % 2]; + auto const block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; + auto const block_start_col = block.start_col; + auto const block_start_row = block.start_row; + + auto const num_block_cols = block.num_cols(); + auto const num_block_rows = block.num_rows(); + + auto const num_sections_x = (num_block_cols + 7) / 8; + auto const num_sections_y = (num_block_rows + 31) / 32; + auto const validity_data_col_length = align_offset(num_sections_y, 4); + auto const total_sections = num_sections_x * num_sections_y; + + if (print_debug) { + printf("%d %d - block %d has %d cols, %d rows, and %d total sections\n", + threadIdx.x, + blockIdx.x, + blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block, + num_block_cols, + num_block_rows, + total_sections); + } + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + + if (print_debug) + printf( + "%d %d - my warp is %d, %d total sections, %d warps per block, blockDim.x=%d, warp side " + "%d\n", + threadIdx.x, + blockIdx.x, + warp_id, + total_sections, + warps_per_block, + blockDim.x, + detail::warp_size); + // the block is divided into sections. A warp operates on a section at a time. + for (int my_section_idx = warp_id; my_section_idx < total_sections; + my_section_idx += warps_per_block) { + // convert to rows and cols + auto const section_x = my_section_idx % num_sections_x; + auto const section_y = my_section_idx / num_sections_x; + + auto const relative_col = section_x * 8; + auto const relative_row = section_y * 32 + lane_id; + auto const absolute_col = relative_col + block_start_col; + auto const absolute_row = relative_row + block_start_row; + auto const rows_left = num_rows - absolute_row; + + if (print_debug) + printf("%d-%d: si: %d nsx: %d nsy: %d sx: %d sy: %d ar: %d nr: %d rc: %d rr: %d\n", + threadIdx.x, + blockIdx.x, + my_section_idx, + num_sections_x, + num_sections_y, + section_x, + section_y, + absolute_row, + num_rows, + relative_col, + relative_row); + auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_row < num_rows); + + if (absolute_row < num_rows) { + auto const my_byte = + input_data[row_offsets[absolute_row] + validity_offset + absolute_col / 8]; + + // so every thread that is participating in the warp has a byte, but it's row-based + // data and we need it in column-based. So we shiffle the bits around to make + // the bytes we actually write. 
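// A minimal sketch of the tail handling used by the validity kernels here: a 32-bit
// ballot word is stored as 1, 2, 3, or 4 bytes depending on how many destination
// bits remain, so the store never runs past the end of the buffer. The helper name
// is illustrative, and the low-byte memcpy matches the int8/int16/int32 stores in
// the kernel only on a little-endian target such as the GPU.
#include <cstdint>
#include <cstring>

namespace sketch {

// write the low ceil(bits_left / 8) bytes of `word` (capped at 4) into dst; bits_left >= 1
inline void store_partial_word(uint8_t* dst, uint32_t word, int bits_left)
{
  int const bytes = bits_left >= 32 ? 4 : (bits_left + 7) / 8;
  std::memcpy(dst, &word, bytes);
}

}  // namespace sketch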
+ for (int i = 0, byte_mask = 1; i < 8 && relative_col + i < num_columns; + ++i, byte_mask <<= 1) { + auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); + // lead thread in each warp writes data + if (threadIdx.x % detail::warp_size == 0) { + auto const validity_write_offset = + validity_data_col_length * (relative_col + i) + relative_row / 8; + + if (print_debug) + printf("%d - Writing validity data 0x%x to shared memory location %d\n", + threadIdx.x, + validity_data, + validity_write_offset); + if (rows_left <= 8) { + // write byte + this_shared_block[validity_write_offset] = validity_data & 0xFF; + } else if (rows_left <= 16) { + // write int16 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + } else if (rows_left <= 24) { + // write int16 and then int8 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; + } else { + // write int32 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data; + } + } } - // auto const dst_bit_mask = 1 << dst_shift; - dst_validity[i] |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); } } - - for (int i=0; i(output_nm[start_col + i] + (starting_row / 32)); - if (debug_print) { - printf("%d-%d: validity write output_nm[%d][%d] - %p <- %d\n", threadIdx.x, blockIdx.x, start_col + i, starting_row, validity_ptr, dst_validity[i]); - } - *validity_ptr = dst_validity[i]; + // make sure entire block has finished copy + group.sync(); + + // now async memcpy the shared + for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { + auto const relative_col = col - block.start_col; + auto const words_to_copy = util::div_rounding_up_unsafe(num_block_rows, 32); + auto const starting_address = output_nm[col] + word_index(block_start_row); + + if (print_debug) + printf("memcpy_async(%p(offset %d), %p, %d, subset_barrier);\n", + starting_address, + word_index(block_start_row), + &this_shared_block[validity_data_col_length * relative_col], + words_to_copy * 4); + cuda::memcpy_async( + output_nm[col] + word_index(block_start_row), + &this_shared_block[validity_data_col_length * relative_col], + util::div_rounding_up_unsafe(num_block_rows, 8), + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); } } + + // if (print_debug) printf("leaving...\n"); + // wait for last blocks of data to arrive + auto const num_blocks_to_wait = blocks_remaining > NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED + ? NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED + : blocks_remaining; + for (int validity_block = 0; validity_block < num_blocks_to_wait; ++validity_block) { + shared_block_barriers[validity_block].arrive_and_wait(); + } } -} + +#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 /** * Calculate the dimensions of the kernel for fixed width only columns. @@ -834,8 +1437,8 @@ __global__ void copy_to_columns(const size_type num_rows, static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, const cudf::size_type num_rows, const cudf::size_type size_per_row, - dim3 &blocks, - dim3 &threads) + dim3& blocks, + dim3& threads) { // We have found speed degrades when a thread handles more than 4 columns. // Each block is 2 dimensional. The y dimension indicates the columns. 
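// A minimal sketch of the fixed-width kernel sizing described above: the y
// dimension covers columns with roughly 4 columns per thread (capped at 32 in y),
// and the x dimension takes whatever is left of a 1024-thread block. The arithmetic
// mirrors the surrounding code; the function name is illustrative.
#include <cassert>

namespace sketch {

struct block_shape {
  int x;  // threads along rows
  int y;  // threads along columns
};

inline block_shape fixed_width_block_shape(int num_columns)
{
  int y = (num_columns + 3) / 4;  // one thread per 4 columns...
  if (y > 32) { y = 32; }         // ...but never more than 32 in y
  int const x = 1024 / y;         // fill the rest of the 1024-thread budget
  return {x, y};
}

inline void block_shape_example()
{
  assert(fixed_width_block_shape(9).y == 3);     // ceil(9 / 4)
  assert(fixed_width_block_shape(9).x == 341);   // 1024 / 3
  assert(fixed_width_block_shape(500).y == 32);  // capped at 32
  assert(fixed_width_block_shape(500).x == 32);
}

}  // namespace sketch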
@@ -846,7 +1449,7 @@ static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, // in the x dimension because we use atomic operations at the block // level when writing validity data out to main memory, and that would // need to change if we split a word of validity data between blocks. - int y_block_size = (num_columns + 3) / 4; + int y_block_size = (num_columns + 3) / 4; // cudf::util::div_rounding_up_safe(num_columns, 4); if (y_block_size > 32) { y_block_size = 32; } int x_possible_block_size = 1024 / y_block_size; // 48KB is the default setting for shared memory per block according to the cuda tutorials @@ -895,14 +1498,14 @@ static std::unique_ptr fixed_width_convert_to_rows( const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type size_per_row, - rmm::device_uvector &column_start, - rmm::device_uvector &column_size, - rmm::device_uvector &input_data, - rmm::device_uvector &input_nm, - const cudf::scalar &zero, - const cudf::scalar &scalar_size_per_row, + rmm::device_uvector& column_start, + rmm::device_uvector& column_size, + rmm::device_uvector& input_data, + rmm::device_uvector& input_nm, + const cudf::scalar& zero, + const cudf::scalar& scalar_size_per_row, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { int64_t total_allocation = size_per_row * num_rows; // We made a mistake in the split somehow @@ -944,12 +1547,12 @@ static std::unique_ptr fixed_width_convert_to_rows( mr); } -static cudf::data_type get_data_type(const cudf::column_view &v) { return v.type(); } +static cudf::data_type get_data_type(const cudf::column_view& v) { return v.type(); } -static inline bool are_all_fixed_width(std::vector const &schema) +static inline bool are_all_fixed_width(std::vector const& schema) { return std::all_of( - schema.begin(), schema.end(), [](const cudf::data_type &t) { return cudf::is_fixed_width(t); }); + schema.begin(), schema.end(), [](const cudf::data_type& t) { return cudf::is_fixed_width(t); }); } /** @@ -959,9 +1562,9 @@ static inline bool are_all_fixed_width(std::vector const &schem * @param [out] column_size the size in bytes of the data for each columns in the row. * @return the size in bytes each row needs. */ -static inline int32_t compute_fixed_width_layout(std::vector const &schema, - std::vector &column_start, - std::vector &column_size) +static inline int32_t compute_fixed_width_layout(std::vector const& schema, + std::vector& column_start, + std::vector& column_size) { // We guarantee that the start of each column is 64-bit aligned so anything can go // there, but to make the code simple we will still do an alignment for it. @@ -979,27 +1582,29 @@ static inline int32_t compute_fixed_width_layout(std::vector co // Now we need to add in space for validity // Eventually we can think about nullable vs not nullable, but for now we will just always add it // in - int32_t validity_bytes_needed = (schema.size() + 7) / 8; + int32_t validity_bytes_needed = + (schema.size() + 7) / 8; // cudf::util::div_rounding_up_safe(schema.size(), 8); // validity comes at the end and is byte aligned so we can pack more in. 
at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned return align_offset(at_offset, 8); // 8 bytes (64 bits) } +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + template -static size_type compute_column_information( - iterator begin, - iterator end, - std::vector &column_starts, - std::vector &column_sizes)//, - //std::function nested_type_cb) +static size_type compute_column_information(iterator begin, + iterator end, + std::vector& column_starts, + std::vector& column_sizes) //, +// std::function nested_type_cb) { size_type fixed_width_size_per_row = 0; for (auto cv = begin; cv != end; ++cv) { auto col_type = std::get<0>(*cv); bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; -// if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } + // if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } // a list or string column will write a single uint64 // of data here for offset/length @@ -1021,11 +1626,53 @@ static size_type compute_column_information( //#define DEBUG -static std::vector build_block_infos(std::vector const &column_sizes, - std::vector const &column_starts, - std::vector const &row_batches, - size_type const total_number_of_rows, - size_type const &shmem_limit_per_block) +std::vector build_validity_block_infos( + size_type const& num_columns, + size_type const& num_rows, + size_type const& shmem_limit_per_block, + std::vector const& row_batches) +{ + auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); + auto const column_stride = align_offset( + [&]() { + if (desired_rows_and_columns > num_columns) { + // not many columns, group it into 8s and ship it off + return std::min(8, num_columns); + } else { + return util::round_down_safe(desired_rows_and_columns, 8); + } + }(), + 8); + // we fit as much as we can given the column stride + auto const row_stride = std::min(num_rows, shmem_limit_per_block * 8 / column_stride); + + std::vector validity_block_infos; + for (int col = 0; col < num_columns; col += column_stride) { + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int row = 0; + while (row < num_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(row_stride, rows_left_in_batch); + + validity_block_infos.emplace_back(detail::block_info{ + col, row, std::min(col + column_stride - 1, num_columns - 1), row + window_height - 1}); + row += window_height; + rows_left_in_batch -= window_height; + } + } + + return validity_block_infos; +} + +std::vector build_block_infos(std::vector const& column_sizes, + std::vector const& column_starts, + std::vector const& row_batches, + size_type const total_number_of_rows, + size_type const& shmem_limit_per_block) { std::vector block_infos; @@ -1067,19 +1714,37 @@ static std::vector build_block_infos(std::vector const &c // far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in // bytes, not rows or columns. 
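// A minimal sketch of the window-sizing heuristic in the comment above: a roughly
// square window (in bytes) has a side of about sqrt(shared memory budget), and the
// height in rows is that side divided by the leading column's width, clamped to the
// rows that exist. The values are illustrative and the real code rounds and clamps
// somewhat differently.
#include <algorithm>
#include <cassert>
#include <cmath>

namespace sketch {

inline int window_height_rows(int shmem_limit_bytes, int first_col_size, int total_rows)
{
  int const square_side = static_cast<int>(std::sqrt(static_cast<double>(shmem_limit_bytes)));
  return std::clamp(square_side / first_col_size, 1, total_rows);
}

inline void window_height_example()
{
  // ~48KB of shared memory and a 4-byte leading column -> roughly 55 rows per window
  assert(window_height_rows(48 * 1024, 4, 1 << 20) == 55);
}

}  // namespace sketch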
- int const window_height = std::min( - std::min(size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], total_number_of_rows), - row_batches[0].row_count); + size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); + int const window_height = + std::clamp(util::round_up_safe( + optimal_square_len <= (size_type)column_sizes.size() + ? std::min(optimal_square_len / column_sizes[0], total_number_of_rows) + : row_batches[0].row_count / 2, + 32), + 1, + row_batches[0].row_count); #if defined(DEBUG) printf( - "sqrt(shmem_limit_per_block) / column_sizes[0] is %d and num_rows is %d, batch row count is %d - which makes window height " - "%d\n", - size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], + "optimal_square_len is %d and we have %d columns, optimal_square_len / column_sizes[0] is %d " + "and num_rows is %d, batch row count is %d " + "- which makes window height " + "%d - admin size is %lu\n", + optimal_square_len, + (int)column_sizes.size(), + optimal_square_len / column_sizes[0], total_number_of_rows, row_batches[0].row_count, - window_height); + window_height, + column_sizes.size() * sizeof(size_type) * 2); #endif + auto calc_admin_data_size = [](int num_cols) -> size_type { + // admin data is the column sizes and column start information. + // this is copied to shared memory as well and needs to be accounted for + // in the window calculation. + return num_cols * sizeof(size_type) + num_cols * sizeof(size_type); + }; + int row_size = 0; // march each column and build the blocks of appropriate sizes @@ -1092,14 +1757,26 @@ static std::vector build_block_infos(std::vector const &c auto row_size_with_this_col = row_size_aligned + col_size; auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - if (row_size_with_end_pad * window_height > shmem_limit_per_block) { + if (row_size_with_end_pad * window_height + + calc_admin_data_size(col - current_window_start_col) > + shmem_limit_per_block) { #if defined(DEBUG) printf( - "Window size %d too large at column %d, bumping back to build windows of size %d(cols " + "row size with end pad is %d and admin data is %d, which adds up to %d and that is too " + "large for shmem block of %d\n", + row_size_with_end_pad, + calc_admin_data_size(col - current_window_start_col), + row_size_with_end_pad * window_height + + calc_admin_data_size(col - current_window_start_col), + shmem_limit_per_block); + printf( + "Window size %d too large at column %d, admin size is %d, bumping back to build windows of " + "size %d(cols " "%d-%d), which is %d tall. 
Row size is too large at %d and ok at %d(aligned overall is %d) " "for shared mem size %d\n", row_size_with_end_pad * window_height, col, + calc_admin_data_size(col - current_window_start_col), row_size * window_height, current_window_start_col, col - 1, @@ -1136,31 +1813,35 @@ static std::vector build_block_infos(std::vector const &c // build last set of blocks if (current_window_width > 0) { - build_blocks(current_window_start_col, (int)column_sizes.size()-1, window_height); + build_blocks(current_window_start_col, (int)column_sizes.size() - 1, window_height); } return block_infos; } -} // namespace detail #if defined(DEBUG) - void pretty_print(uint64_t i) { - if (i > (1 * 1024 * 1024 * 1024)) { - printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); - } else if (i > (1 * 1024 * 1024)) { - printf("%.2f MB", i / float(1 * 1024 * 1024)); - } else if (i > (1 * 1024)) { - printf("%.2f KB", float(i / 1024)); - } else { - printf("%lu Bytes", i); - } +void pretty_print(uint64_t i) +{ + if (i > (1 * 1024 * 1024 * 1024)) { + printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); + } else if (i > (1 * 1024 * 1024)) { + printf("%.2f MB", i / float(1 * 1024 * 1024)); + } else if (i > (1 * 1024)) { + printf("%.2f KB", float(i / 1024)); + } else { + printf("%lu Bytes", i); } +} #endif +#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + +} // namespace detail -std::vector> convert_to_rows2(cudf::table_view const &tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) +std::vector> convert_to_rows(cudf::table_view const& tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the // data, but small enough that multiple columns fit in memory so the writes can coalese as well. // Potential optimization for window sizes. @@ -1169,9 +1850,13 @@ std::vector> convert_to_rows2(cudf::table_view con int device_id; CUDA_TRY(cudaGetDevice(&device_id)); - int shmem_limit_per_block; - CUDA_TRY( - cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + int total_shmem; + CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + +#if defined(DEBUG) || 1 + total_shmem -= 1024; +#endif + int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; #if defined(DEBUG) size_t free, total; @@ -1195,8 +1880,8 @@ std::vector> convert_to_rows2(cudf::table_view con // windows so the windows can be properly cut around them. 
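The window-fitting checks above repeatedly pad row sizes to 8 bytes with the align_offset helper that appears elsewhere in this patch. Here is a standalone copy of that helper with a couple of worked values; it is only valid for power-of-two alignments, which is how the patch uses it.

#include <cstdint>
#include <cstddef>

// Same bit trick the patch uses: round offset up to the next multiple of a
// power-of-two alignment.
constexpr int32_t align_offset(int32_t offset, std::size_t alignment)
{
  return (offset + alignment - 1) & ~(alignment - 1);
}

static_assert(align_offset(13, 8) == 16, "13 bytes of row data pad out to 16");
static_assert(align_offset(16, 8) == 16, "already-aligned offsets are unchanged");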
// Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; + std::vector input_data; + std::vector input_nm; input_data.reserve(num_columns); input_nm.reserve(num_columns); for (size_type column_number = 0; column_number < num_columns; column_number++) { @@ -1224,16 +1909,16 @@ std::vector> convert_to_rows2(cudf::table_view con column_sizes.reserve(num_columns); column_starts.reserve(num_columns + 1); // we add a final offset for validity data start - auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&tbl](auto i) -> std::tuple { - return std::make_tuple(tbl.column(i).type(), tbl.column(i)); - }); + auto iter = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [&tbl](auto i) -> std::tuple { + return std::make_tuple(tbl.column(i).type(), tbl.column(i)); + }); - size_type fixed_width_size_per_row = detail::compute_column_information( - iter, - iter + num_columns, - column_starts, - column_sizes);//, -// [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); + size_type fixed_width_size_per_row = detail::compute_column_information(iter, + iter + num_columns, + column_starts, + column_sizes); //, + // [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); /* size_type fixed_width_size_per_row = 0; for (int col = 0; col < num_columns; ++col) { auto cv = tbl.column(col); @@ -1261,7 +1946,6 @@ std::vector> convert_to_rows2(cudf::table_view con column_starts.back() + column_sizes.back()); #endif - auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); @@ -1329,7 +2013,8 @@ std::vector> convert_to_rows2(cudf::table_view con row_batch_rows++; } if (row_batch_size > 0) { - row_batches.push_back(detail::row_batch{static_cast(row_batch_size), row_batch_rows}); + row_batches.push_back( + detail::row_batch{static_cast(row_batch_size), row_batch_rows}); } auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); @@ -1339,17 +2024,17 @@ std::vector> convert_to_rows2(cudf::table_view con printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { printf("%d: %d rows, ", i, row_batches[i].row_count); - pretty_print(row_batches[i].num_bytes); + detail::pretty_print(row_batches[i].num_bytes); printf("\n"); } #endif std::vector output_buffers; - std::vector output_data; + std::vector output_data; output_data.reserve(row_batches.size()); for (uint i = 0; i < row_batches.size(); ++i) { rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); - output_data.push_back(static_cast(temp.data())); + output_data.push_back(static_cast(temp.data())); output_buffers.push_back(std::move(temp)); } auto dev_output_data = make_device_uvector_async(output_data, stream, mr); @@ -1362,38 +2047,63 @@ std::vector> convert_to_rows2(cudf::table_view con block_infos.size(), block_infos[0].end_col - block_infos[0].start_col + 1, block_infos[0].end_row - block_infos[0].start_row); - pretty_print(shmem_limit_per_block); + detail::pretty_print(shmem_limit_per_block); printf(" shared mem("); - pretty_print(fixed_width_size_per_row); + detail::pretty_print(fixed_width_size_per_row); printf("/row, %d columns, %d rows, ", num_columns, num_rows); - pretty_print(total_table_size); + detail::pretty_print(total_table_size); printf(" total):\n"); #endif auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); 
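The batching loop above accumulates per-row byte counts into row batches so that no single output buffer outgrows what the 32-bit offset type can address, and one device buffer is then allocated per batch. A simplified host-side sketch of that accumulation follows; the real code also cuts on 32-row boundaries for the validity data, and `make_row_batches` plus the struct name here are illustrative, not part of the patch.

#include <cstdint>
#include <vector>

struct simple_row_batch {
  int64_t num_bytes;
  int32_t row_count;
};

// Accumulate rows into a batch until adding the next row would exceed the
// byte limit, then start a new batch.
std::vector<simple_row_batch> make_row_batches(std::vector<int32_t> const& row_sizes,
                                               int64_t max_batch_bytes)
{
  std::vector<simple_row_batch> batches;
  simple_row_batch current{0, 0};
  for (auto size : row_sizes) {
    if (current.row_count > 0 && current.num_bytes + size > max_batch_bytes) {
      batches.push_back(current);
      current = {0, 0};
    }
    current.num_bytes += size;
    current.row_count++;
  }
  if (current.row_count > 0) { batches.push_back(current); }
  return batches;
}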
// blast through the entire table and convert it - dim3 blocks(block_infos.size()); - #if defined(DEBUG) || 1 - dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)total_table_size)); - #else - dim3 threads(std::min(std::min(1024, shmem_limit_per_block / 8), (int)total_table_size)); - #endif + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS)); + dim3 threads(256); + #if defined(DEBUG) printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); - pretty_print(shmem_limit_per_block); + detail::pretty_print(shmem_limit_per_block); printf(" shared memory\n"); #endif - copy_from_columns<<>>( + detail::copy_from_columns<<>>( num_rows, num_columns, + shmem_limit_per_block, + block_infos.size(), dev_input_data.data(), - dev_input_nm.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), dev_row_offsets.data(), - reinterpret_cast(dev_output_data.data())); + reinterpret_cast(dev_output_data.data())); + + auto validity_block_infos = + build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); + + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); + dim3 validity_blocks( + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); +#if defined(DEBUG) + printf("Launching validity kernel with %d blocks, for %lu validity blocks with %d threads, ", + validity_blocks.x, + validity_block_infos.size(), + validity_threads.x); + detail::pretty_print(total_shmem); + printf(" shared memory\n"); +#endif + detail:: + copy_validity_from_columns<<>>( + num_rows, + num_columns, + shmem_limit_per_block, + dev_row_offsets.data(), + dev_output_data.data(), + column_starts.back(), + dev_validity_block_infos.data(), + validity_block_infos.size(), + dev_input_nm.data()); // split up the output buffer into multiple buffers based on row batch sizes // and create list of byte columns @@ -1428,11 +2138,15 @@ std::vector> convert_to_rows2(cudf::table_view con } return ret; +#else + CUDF_FAIL("Column to row conversion optimization requires volta or later hardware."); + return {}; +#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } -std::vector> convert_to_rows(cudf::table_view const &tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) +std::vector> old_convert_to_rows(cudf::table_view const& tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { const cudf::size_type num_columns = tbl.num_columns(); @@ -1456,8 +2170,8 @@ std::vector> convert_to_rows(cudf::table_view cons cudf::size_type num_rows = tbl.num_rows(); // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; + std::vector input_data; + std::vector input_nm; for (cudf::size_type column_number = 0; column_number < num_columns; column_number++) { cudf::column_view cv = tbl.column(column_number); input_data.emplace_back(cv.data()); @@ -1469,11 +2183,11 @@ std::vector> convert_to_rows(cudf::table_view cons using ScalarType = cudf::scalar_type_t; auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); zero->set_valid_async(true, stream); - static_cast(zero.get())->set_value(0, stream); + static_cast(zero.get())->set_value(0, stream); auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); 
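The zero and step scalars built here describe an arithmetic sequence of row offsets for the fixed-width path: with a constant size_per_row, row i of the packed output presumably starts at i * size_per_row. A minimal host-side illustration of that layout is below; the scalar/sequence plumbing the actual code uses is not reproduced, and the function name is hypothetical.

#include <cstdint>
#include <vector>

// offsets[i] = i * size_per_row; offsets[num_rows] is the total byte count,
// matching the zero (start) and step (stride) scalars above.
std::vector<int32_t> fixed_width_row_offsets(int32_t num_rows, int32_t size_per_row)
{
  std::vector<int32_t> offsets(num_rows + 1);
  for (int32_t i = 0; i <= num_rows; ++i) {
    offsets[i] = i * size_per_row;
  }
  return offsets;
}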
step->set_valid_async(true, stream); - static_cast(step.get()) + static_cast(step.get()) ->set_value(static_cast(size_per_row), stream); std::vector> ret; @@ -1500,11 +2214,12 @@ std::vector> convert_to_rows(cudf::table_view cons } } -std::unique_ptr convert_from_rows2(cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) +std::unique_ptr convert_from_rows(cudf::lists_column_view const& input, + std::vector const& schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 // verify that the types are what we expect cudf::column_view child = input.child(); cudf::type_id list_type = child.type().id(); @@ -1516,11 +2231,13 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i int device_id; CUDA_TRY(cudaGetDevice(&device_id)); - int shmem_limit_per_block; - CUDA_TRY( - cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + int total_shmem; + CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - shmem_limit_per_block /= NUM_BLOCKS_PER_KERNEL_TO_COLUMNS; +#if defined(DEBUG) || 1 + total_shmem -= 1024; +#endif + int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; std::vector column_starts; std::vector column_sizes; @@ -1529,7 +2246,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i return std::make_tuple(schema[i], nullptr); }); size_type fixed_width_size_per_row = detail::compute_column_information( - iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); + iter, iter + num_columns, column_starts, column_sizes); //, [](void *) {}); size_type validity_size = num_bitmask_words(num_columns) * 4; @@ -1537,8 +2254,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i // Ideally we would check that the offsets are all the same, etc. 
but for now // this is probably fine - CUDF_EXPECTS(row_size * num_rows == child.size(), - "The layout of the data appears to be off"); + CUDF_EXPECTS(row_size * num_rows == child.size(), "The layout of the data appears to be off"); auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); @@ -1549,8 +2265,8 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i // Allocate the columns we are going to write into std::vector> output_columns; - std::vector output_data; - std::vector output_nm; + std::vector output_data; + std::vector output_nm; for (cudf::size_type i = 0; i < num_columns; i++) { auto column = cudf::make_fixed_width_column( schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); @@ -1568,36 +2284,97 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - dim3 blocks((block_infos.size() + (NUM_BLOCKS_PER_KERNEL_TO_COLUMNS - 1)) / NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); - #if defined(DEBUG) || 1 - dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)child.size())); - #else - dim3 threads(std::min(std::min(1024, shmem_limit_per_block / 8), (int)child.size())); - #endif + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); +#if defined(DEBUG) + dim3 threads(std::min(std::min(128, shmem_limit_per_block / 8), (int)child.size())); +#else + dim3 threads(std::min(256, (int)child.size())); +#endif #if defined(DEBUG) printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); - pretty_print(shmem_limit_per_block); + detail::pretty_print(total_shmem); printf(" shared memory\n"); #endif - detail::copy_to_columns<<>>( + detail::copy_to_columns<<>>( num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), dev_output_data.data(), - dev_output_nm.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), + block_infos.size(), child.data()); + auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); + auto const column_stride = [&]() { + if (desired_rows_and_columns > num_columns) { + // not many columns, group it into 8s and ship it off + return std::min(8, num_columns); + } else { + return util::round_down_safe(desired_rows_and_columns, 8); + } + }(); + auto const row_stride = [&]() { + // we fit as much as we can, we know the column stride now, so calculate the row + return std::min(num_rows, util::round_down_safe(shmem_limit_per_block * 8 / column_stride, 32)); + /* if (desired_rows_and_columns > num_rows) { + return std::min(32, num_rows); + } else { + return util::round_down_safe(desired_rows_and_columns, 32); + }*/ + }(); + std::vector validity_block_infos; + for (int col = 0; col < num_columns; col += column_stride) { + for (int row = 0; row < num_rows; row += row_stride) { + validity_block_infos.emplace_back( + detail::block_info{col, + row, + std::min(col + column_stride - 1, num_columns - 1), + std::min(row + row_stride - 1, num_rows - 1)}); + } + } + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); + dim3 validity_blocks( + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); +#if defined(DEBUG) + printf( + "Launching validity kernel with %d blocks, for %lu validity blocks, col stride %d and row " + "stride of %d with %d threads, ", + validity_blocks.x, + 
validity_block_infos.size(), + column_stride, + row_stride, + threads.x); + detail::pretty_print(total_shmem); + printf(" shared memory\n"); +#endif + + dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); + detail:: + copy_validity_to_columns<<>>( + num_rows, + num_columns, + shmem_limit_per_block, + input.offsets().data(), + dev_output_nm.data(), + column_starts.back(), + dev_validity_block_infos.data(), + validity_block_infos.size(), + child.data()); + return std::make_unique(std::move(output_columns)); +#else + CUDF_FAIL("Row to column conversion optimization requires volta or later hardware."); + return {}; +#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } -std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) +std::unique_ptr old_convert_from_rows(cudf::lists_column_view const& input, + std::vector const& schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // verify that the types are what we expect cudf::column_view child = input.child(); @@ -1619,12 +2396,12 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in CUDF_EXPECTS(size_per_row * num_rows == child.size(), "The layout of the data appears to be off"); auto dev_column_start = make_device_uvector_async(column_start, stream); - auto dev_column_size = make_device_uvector_async(column_size, stream); + auto dev_column_size = make_device_uvector_async(column_size, stream); // Allocate the columns we are going to write into std::vector> output_columns; - std::vector output_data; - std::vector output_nm; + std::vector output_data; + std::vector output_nm; for (cudf::size_type i = 0; i < num_columns; i++) { auto column = cudf::make_fixed_width_column( schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); @@ -1642,6 +2419,11 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in int shared_size = detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + // printf("Launching (%d, %d, %d) blocks, (%d, %d, %d) threads, with %d shared size\n", + // blocks.x, blocks.y, blocks.z, threads.x, threads.y, threads.z, shared_size); + // printf("pointers are column_start: %p, column_size: %p, output_data: %p, output_nm: %p\n", + // dev_column_start.data(), dev_column_size.data(), dev_output_data.data(), + // dev_output_nm.data()); detail::copy_to_fixed_width_columns<<>>( num_rows, num_columns, @@ -1658,36 +2440,4 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in } } -std::unique_ptr convert_from_rows( - std::vector> const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - CUDF_EXPECTS(input.size() == 1, "Too large of an input, need to concat the output tables..."); - - // for (uint i=0; iview(); - auto ret = convert_from_rows(lcv, schema, stream, mr); - - return ret; - // } -} - -std::unique_ptr convert_from_rows2( - std::vector> const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - CUDF_EXPECTS(input.size() == 1, "Too large of an input, need to concat the output tables..."); - - // for (uint i=0; iview(); - auto ret = convert_from_rows2(lcv, schema, stream, mr); - - return ret; - // } -} - } // namespace cudf diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index 818d7a89ddb..e38b37e81a6 
100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -34,8 +34,8 @@ TEST_F(ColumnToRowTests, Single) cudf::test::fixed_width_column_wrapper a({-1}); cudf::table_view in(std::vector{a}); - auto old_rows = cudf::convert_to_rows(in); - auto new_rows = cudf::convert_to_rows2(in); + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); i++) { @@ -48,8 +48,8 @@ TEST_F(ColumnToRowTests, Simple) cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); cudf::table_view in(std::vector{a}); - auto old_rows = cudf::convert_to_rows(in); - auto new_rows = cudf::convert_to_rows2(in); + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); i++) { @@ -64,8 +64,8 @@ TEST_F(ColumnToRowTests, Tall) cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); cudf::table_view in(std::vector{a}); - auto old_rows = cudf::convert_to_rows(in); - auto new_rows = cudf::convert_to_rows2(in); + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); i++) { @@ -84,8 +84,8 @@ TEST_F(ColumnToRowTests, Wide) } cudf::table_view in(views); - auto old_rows = cudf::convert_to_rows(in); - auto new_rows = cudf::convert_to_rows2(in); + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); i++) { @@ -104,8 +104,31 @@ TEST_F(ColumnToRowTests, SingleByteWide) } cudf::table_view in(views); - auto old_rows = cudf::convert_to_rows(in); - auto new_rows = cudf::convert_to_rows2(in); + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, Big) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + 4096 * i, r + 4096 * i + 4096)); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); i++) { @@ -120,9 +143,9 @@ TEST_F(RowToColumnTests, Single) auto old_rows = cudf::convert_to_rows(in); std::vector schema{cudf::data_type{cudf::type_id::INT32}}; - for (uint i=0; i a({-1, 0, 1}); cudf::table_view in(std::vector{a}); - auto old_rows = cudf::convert_to_rows(in); + auto old_rows = cudf::old_convert_to_rows(in); std::vector schema{cudf::data_type{cudf::type_id::INT32}}; - for (uint i=0; i a(r, r + (size_t)4096); cudf::table_view in(std::vector{a}); - auto old_rows = cudf::convert_to_rows(in); + auto old_rows = cudf::old_convert_to_rows(in); std::vector schema; schema.reserve(in.num_columns()); for (auto col = in.begin(); col < in.end(); ++col) { schema.push_back(col->type()); } - for (uint i=0; i views; for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + 
cols.push_back(cudf::test::fixed_width_column_wrapper({i})); // rand()})); views.push_back(cols.back()); } cudf::table_view in(views); - auto old_rows = cudf::convert_to_rows(in); + auto old_rows = cudf::old_convert_to_rows(in); std::vector schema; schema.reserve(in.num_columns()); for (auto col = in.begin(); col < in.end(); ++col) { schema.push_back(col->type()); } - for (uint i=0; i schema; schema.reserve(in.num_columns()); for (auto col = in.begin(); col < in.end(); ++col) { schema.push_back(col->type()); } - for (uint i=0; i int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + cols.push_back(cudf::test::fixed_width_column_wrapper(r, r + 13)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, Big) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + for (int i = 0; i < 256; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + 4096 * i, r + 4096 * i + 4096)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 68f1ae93dec..1babbc6fd1a 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -14,36 +14,52 @@ * limitations under the License. */ +#include #include +#include #include +#include + +#include +#include + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +#include +#endif #include #include +#include +#include +#include +#include #include #include +#include #include #include #include #include +#include #include - -#include "row_conversion.hpp" - +#include +#include + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +constexpr auto NUM_BLOCKS_PER_KERNEL_TO_COLUMNS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; +constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; +constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; +#endif + +using cudf::detail::make_device_uvector_async; namespace cudf { -namespace java { -/** - * Copy a simple vector to device memory asynchronously. Be sure to read - * the data on the same stream as is used to copy it. 
- */ -template -std::unique_ptr> copy_to_dev_async(const std::vector &input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { - std::unique_ptr> ret(new rmm::device_uvector(input.size(), stream, mr)); - CUDA_TRY(cudaMemcpyAsync(ret->data(), input.data(), sizeof(T) * input.size(), - cudaMemcpyHostToDevice, stream.value())); - return ret; +namespace detail { + +static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size_t alignment) { + return (offset + alignment - 1) & ~(alignment - 1); } __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, @@ -53,7 +69,6 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, const cudf::size_type *num_bytes, int8_t **output_data, cudf::bitmask_type **output_nm, const int8_t *input_data) { - // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -122,7 +137,6 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, cudf::size_type col_index_stride = blockDim.y; for (cudf::size_type col_index = col_index_start; col_index < num_columns; col_index += col_index_stride) { - cudf::size_type col_size = num_bytes[col_index]; const int8_t *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); int8_t *col_output = output_data[col_index]; @@ -208,7 +222,6 @@ copy_from_fixed_width_columns(const cudf::size_type start_row, const cudf::size_ for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; row_group_index += row_group_stride) { - // Within the row group there should be 1 thread for each row. This is a // requirement for launching the kernel cudf::size_type row_index = start_row + (row_group_index * rows_per_group) + threadIdx.x; @@ -220,7 +233,6 @@ copy_from_fixed_width_columns(const cudf::size_type start_row, const cudf::size_ cudf::size_type col_index_stride = blockDim.y; for (cudf::size_type col_index = col_index_start; col_index < num_columns; col_index += col_index_stride) { - cudf::size_type col_size = num_bytes[col_index]; int8_t *col_tmp = &(row_tmp[output_offset_in_row[col_index]]); const int8_t *col_input = input_data[col_index]; @@ -304,6 +316,630 @@ copy_from_fixed_width_columns(const cudf::size_type start_row, const cudf::size_ } } +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + +struct block_info { + int start_col; + int start_row; + int end_col; + int end_row; + int buffer_num; + + __host__ __device__ size_type get_row_size(size_type const *const col_offsets, + size_type const *const col_sizes) const { + return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); + } + __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } + + __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } +}; + +// When building the columns to return, we have to be mindful of the offset limit in cudf. +// It is 32-bit and these data columns are capable of surpassing that easily. The data should +// not be cut off exactly at the limit though due to the validity buffers. The most efficient +// place to cut the validity is on a 32-row boundary, so as we calculate the row sizes +// we keep track of the cut points for the validity, which we call row batches. If the row +// is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we +// hit. 
Note that this boundary is for our book-keeping with column pointers and not anything that +// the kernel needs to worry about. We cut the output at convienient boundaries when assembling +// the outgoing data stream. +struct row_batch { + size_type num_bytes; + size_type row_count; +}; + +/** + * @brief copy data from cudf columns into x format, which is row-based + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param input_data pointer to raw table data + * @param input_nm pointer to validity data + * @param col_sizes array of sizes for each element in a column - one per column + * @param col_offsets offset into input data row for each column's start + * @param block_infos information about the blocks of work + * @param row_offsets offset to a specific row in the input data + * @param output_data pointer to output data + * + */ +__global__ void copy_from_columns(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, + const size_type num_block_infos, const int8_t **input_data, + const size_type *col_sizes, const size_type *col_offsets, + const block_info *block_infos, const size_type *row_offsets, + int8_t **output_data) { + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // This has been broken up for us in the block_info struct, so we don't have + // any calculation to do here, but it is important to note. + + constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; + auto group = cooperative_groups::this_thread_block(); + extern __shared__ int8_t shared_data[]; + int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; + + __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&block_barrier[i], group.size()); + } + } + + group.sync(); + + auto const blocks_remaining = + std::min((uint)(num_block_infos % NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS), + std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS, + (uint)NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + + size_t fetch; + size_t subset; + for (subset = fetch = 0; subset < blocks_remaining; ++subset) { + // Fetch ahead up to stages_count subsets + for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { + auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch]; + + auto const num_fetch_cols = fetch_block.num_cols(); + auto const num_fetch_rows = fetch_block.num_rows(); + auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; + auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); + auto const starting_column_offset = col_offsets[fetch_block.start_col]; + auto &fetch_barrier = block_barrier[fetch % NUM_BLOCKS_PER_KERNEL_LOADED]; + + // wait for the last use of the memory to be completed + if (fetch > NUM_BLOCKS_PER_KERNEL_LOADED) { + fetch_barrier.arrive_and_wait(); + } + + // to do the copy we need to do n column copies followed by m element copies OR + // we have to do m element copies followed by r row copies. 
When going from column + // to row it is much easier to copy by elements first otherwise we would need a running + // total of the column sizes for our block, which isn't readily available. This makes it more + // appealing to copy element-wise from input data into shared matching the end layout and do + // row-based memcopies out. + + for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { + auto const relative_col = el / num_fetch_rows; + auto const relative_row = el % num_fetch_rows; + auto const absolute_col = relative_col + fetch_block.start_col; + auto const absolute_row = relative_row + fetch_block.start_row; + auto const col_size = col_sizes[absolute_col]; + auto const col_offset = col_offsets[absolute_col]; + auto const relative_col_offset = col_offset - starting_column_offset; + + auto const shared_offset = relative_row * fetch_block_row_size + relative_col_offset; + auto const input_src = input_data[absolute_col] + col_size * absolute_row; + + // copy the main + cuda::memcpy_async(&shared[fetch % stages_count][shared_offset], input_src, col_size, + fetch_barrier); + } + } + + auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; + subset_barrier.arrive_and_wait(); + + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; + /* auto const rows_in_block = block.num_rows(); + auto const cols_in_block = block.num_cols();*/ + auto const block_row_size = block.get_row_size(col_offsets, col_sizes); + auto const column_offset = col_offsets[block.start_col]; + + // copy entire rows to final dest + for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; + absolute_row += blockDim.x) { + auto const relative_row = absolute_row - block.start_row; + auto const output_dest = + output_data[block.buffer_num] + absolute_row * block_row_size + column_offset; + auto const shared_offset = block_row_size * relative_row; + cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], block_row_size, + subset_barrier); + } + } + + // wait on the last copies to complete + for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { + block_barrier[i].arrive_and_wait(); + } +} + +/** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block + * @param offsets + * @param output_data pointer to output data, partitioned by data size + * @param validity_offsets offset into input data row for validity data + * @param block_infos information about the blocks of work + * @param num_block_infos number of infos in blocks array + * @param input_data pointer to input data + * + */ +__global__ void copy_validity_from_columns( + const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, + const size_type *row_offsets, int8_t **output_data, const size_type validity_offset, + const block_info *block_infos, const size_type num_block_infos, const bitmask_type **input_nm) { + extern __shared__ int8_t shared_data[]; + int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { + shared_data, shared_data + shmem_used_per_block / 2}; + + // per conversation with DaveB + // each thread of warp reads a single int32 of validity - so we read 128 bytes + // then ballot_sync the bits and write the result to shmem + // after we fill shared mem memcpy it out in a blob. 
+ // probably need knobs for number of rows vs columns to balance read/write + auto group = cooperative_groups::this_thread_block(); + + int const blocks_remaining = + std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, + (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); + + __shared__ cuda::barrier + shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&shared_block_barriers[i], group.size()); + } + } + + group.sync(); + + for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { + if (validity_block != validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] + .arrive_and_wait(); + } + int8_t *this_shared_block = shared_blocks[validity_block % 2]; + auto block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; + + auto const num_block_cols = block.num_cols(); + auto const num_block_rows = block.num_rows(); + + auto const num_sections_x = (num_block_cols + 31) / 32; + auto const num_sections_y = (num_block_rows + 7) / 8; + auto const validity_data_row_length = + align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); + auto const total_sections = num_sections_x * num_sections_y; + + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + + // the block is divided into sections. A warp operates on a section at a time. + for (int my_section_idx = warp_id; my_section_idx < total_sections; + my_section_idx += warps_per_block) { + // convert to rows and cols + auto const section_x = my_section_idx / num_sections_x; + auto const section_y = my_section_idx % num_sections_x; + + auto const relative_col = section_x * 32 + lane_id; + auto const relative_row = section_y * 8; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + auto const cols_left = num_columns - absolute_col; + + auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); + + if (absolute_col < num_columns) { + auto my_byte = + input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] : 0xFF; + + // every thread that is participating in the warp has a byte, but it's column-based + // data and we need it in row-based. So we shiffle the bits around with ballot_sync to make + // the bytes we actually write. 
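The comment above leans on __ballot_sync to turn per-lane, column-oriented validity bits into a row-oriented 32-bit word. A minimal device-side sketch of just that step, in isolation from the rest of the kernel: each participating lane contributes one predicate bit, and every lane gets back a word whose bit i is lane i's predicate, i.e. 32 columns' worth of validity for one row in a single register.

// Sketch only: lane i contributes the validity bit of "its" column;
// the returned word has that bit at position i.
__device__ uint32_t gather_row_validity_bits(uint8_t my_column_byte,
                                             int bit_in_byte,
                                             uint32_t participation_mask)
{
  bool const my_bit_set = my_column_byte & (1u << bit_in_byte);
  // bit `lane` of the result is 1 iff that lane is in participation_mask,
  // is active, and passed a non-zero predicate
  return __ballot_sync(participation_mask, my_bit_set);
}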
+ for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { + auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); + // lead thread in each warp writes data + auto const validity_write_offset = + validity_data_row_length * (relative_row + i) + relative_col / 8; + if (threadIdx.x % detail::warp_size == 0) { + if (cols_left <= 8) { + // write byte + this_shared_block[validity_write_offset] = validity_data & 0xFF; + } else if (cols_left <= 16) { + // write int16 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + } else if (cols_left <= 24) { + // write int16 and then int8 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; + } else { + // write int32 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data; + } + } + } + } + } + + // make sure entire block has finished copy + group.sync(); + + // now async memcpy the shared memory out to the final destination + for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { + auto const relative_row = row - block.start_row; + auto const output_ptr = + output_data[block.buffer_num] + row_offsets[row] + validity_offset + block.start_col / 8; + auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); + cuda::memcpy_async( + output_ptr, &this_shared_block[validity_data_row_length * relative_row], num_bytes, + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); + } + } + + // wait for last blocks of data to arrive + for (int validity_block = 0; + validity_block < blocks_remaining % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + ++validity_block) { + shared_block_barriers[validity_block].arrive_and_wait(); + } +} + +static __device__ std::tuple +get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num_cols) { + auto const col_size_bytes = num_cols * col_size_size; + auto const col_offset_bytes = num_cols * col_offset_size; + + return {col_size_bytes, col_offset_bytes}; +} + +/** + * @brief ensure `read_ahead` buffer blocks are fetched + * + * @param fetch_index internal state passed into the function + * @param processing_index index where processing is occuring + * @param read_ahead_count how many blocks to read ahead + * @param max_resident_blocks how many blocks can be loaded at once + * @param total_blocks total number of blocks overall + * @param block_infos pointer to the block infos + * @param col_sizes pointer to column size information + * @param col_offsets pointer to the table's column offsets + * @param row_offsets pointer to offsets for each row in the table + * @param input_data pointer to the input data + * @param shared pointer to shared memory + * @param group thread group participating in the fetch + * @param block_barrier barriers used for each block + * @return + */ +static __device__ void +fetch_blocks_for_row_to_column(size_t &fetch_index, size_t const processing_index, + int const read_ahead_count, int const max_resident_blocks, + int const total_blocks, block_info const *const block_infos, + size_type const *const col_sizes, size_type const *const col_offsets, + size_type const *const row_offsets, int8_t const *const input_data, + int8_t *shared[], cooperative_groups::thread_block const group, + cuda::barrier *block_barrier) { + for (; fetch_index < static_cast(total_blocks) && + fetch_index < 
(processing_index + read_ahead_count); + ++fetch_index) { + auto const fetch_block = + block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index]; + auto const fetch_block_start_row = fetch_block.start_row; + auto const fetch_block_end_row = fetch_block.end_row; + auto const starting_col_offset = col_offsets[fetch_block.start_col]; + + auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); + auto const num_fetch_cols = fetch_block.num_cols(); + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( + sizeof(decltype(*col_sizes)), sizeof(decltype(*col_offsets)), num_fetch_cols); + auto &fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; + + // if we have fetched all buffers, we need to wait for processing + // to complete on them before we can use them again + if (fetch_index > NUM_BLOCKS_PER_KERNEL_LOADED) { + fetch_barrier.arrive_and_wait(); + } + + auto shared_row_offset = 0; + // copy the data for column sizes + cuda::memcpy_async(group, &shared[fetch_index % max_resident_blocks][shared_row_offset], + &col_sizes[fetch_block.start_col], col_size_bytes, fetch_barrier); + shared_row_offset += col_size_bytes; + // copy the data for column offsets + cuda::memcpy_async(group, &shared[fetch_index % max_resident_blocks][shared_row_offset], + &col_offsets[fetch_block.start_col], col_offset_bytes, fetch_barrier); + shared_row_offset += col_offset_bytes; + shared_row_offset = align_offset(shared_row_offset, 8); + + for (auto row = fetch_block_start_row + static_cast(threadIdx.x); + row <= fetch_block_end_row; row += blockDim.x) { + auto shared_offset = (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; + // copy the main + cuda::memcpy_async(&shared[fetch_index % max_resident_blocks][shared_offset], + &input_data[row_offsets[row] + starting_col_offset], fetch_block_row_size, + fetch_barrier); + } + } +} + +/** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block + * @param row_offsets + * @param output_data + * @param output_nm + * @param col_sizes array of sizes for each element in a column - one per column + * @param col_offsets offset into input data row for each column's start + * @param block_infos information about the blocks of work + * @param input_data pointer to input data + * + */ +__global__ void copy_to_columns(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, const size_type *row_offsets, + int8_t **output_data, const size_type *_col_sizes, + const size_type *_col_offsets, const block_info *block_infos, + const size_type num_block_infos, const int8_t *input_data) { + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // This has been broken up for us in the block_info struct, so we don't have + // any calculation to do here, but it is important to note. 
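The block_info struct referenced above carries inclusive column and row ranges, and its get_row_size pads the byte span of the block's columns out to 8 bytes. A small worked example of that calculation under a made-up column layout (three columns of 1, 4 and 8 bytes whose offsets already carry the per-type alignment):

#include <cassert>
#include <cstdint>

int main()
{
  int32_t const col_offsets[] = {0, 4, 8};
  int32_t const col_sizes[]   = {1, 4, 8};
  int const start_col = 0, end_col = 2;  // inclusive bounds, as in block_info

  // bytes spanned by the block's columns within one row, padded to 8
  int32_t const span     = col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col];
  int32_t const row_size = (span + 7) & ~7;
  assert(row_size == 16);
  return 0;
}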
+ + // to speed up some of the random access memory we do, we copy col_sizes and col_offsets + // to shared memory for each of the blocks that we work on + + constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; + auto group = cooperative_groups::this_thread_block(); + extern __shared__ int8_t shared_data[]; + int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; + + __shared__ cuda::barrier block_barrier[stages_count]; + if (group.thread_rank() == 0) { + for (int i = 0; i < stages_count; ++i) { + init(&block_barrier[i], group.size()); + } + } + + group.sync(); + + auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS, + (uint)NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); + + auto get_admin_data_sizes = [col_size_size = sizeof(decltype(*_col_sizes)), + col_offset_size = sizeof(decltype(*_col_offsets))]( + int const num_cols, + int const num_rows) -> std::tuple { + auto const col_size_bytes = num_cols * col_size_size; + auto const col_offset_bytes = num_cols * col_offset_size; + + return {col_size_bytes, col_offset_bytes}; + }; + + size_t fetch; + size_t subset; + for (subset = fetch = 0; subset < blocks_remaining; ++subset) { + // Fetch ahead up to stages_count subsets + fetch_blocks_for_row_to_column(fetch, subset, stages_count, stages_count, blocks_remaining, + block_infos, _col_sizes, _col_offsets, row_offsets, input_data, + shared, group, block_barrier); + + auto &subset_barrier = block_barrier[subset % stages_count]; + // ensure our data is ready + subset_barrier.arrive_and_wait(); + + auto const block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; + + auto const rows_in_block = block.num_rows(); + auto const cols_in_block = block.num_cols(); + + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes(cols_in_block, rows_in_block); + // auto shared_row_offsets = shared[subset]; + auto shared_col_sizes = reinterpret_cast(shared[subset % stages_count]); + auto shared_col_offsets = + reinterpret_cast(&shared[subset % stages_count][col_size_bytes]); + + auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); + + auto block_row_size = block.get_row_size(_col_offsets, _col_sizes); + + // now we copy from shared memory to final destination. + // the data is laid out in rows in shared memory, so the reads + // for a column will be "vertical". Because of this and the different + // sizes for each column, this portion is handled on row/column basis. + // to prevent each thread working on a single row and also to ensure + // that all threads can do work in the case of more threads than rows, + // we do a global index instead of a double for loop with col/row. 
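The "global index instead of a double for loop" idea above maps a flat per-thread index onto (row, column) within the block and strides by blockDim.x so every thread stays busy even when there are more threads than rows. A stripped-down device-side sketch of that iteration pattern, with the element copy stubbed out:

// Every thread walks the rows*cols element grid of its block with a stride
// of blockDim.x, deriving row/column from the flat index (columns vary fastest).
__device__ void visit_block_elements(int rows_in_block, int cols_in_block)
{
  for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) {
    int const relative_col = index % cols_in_block;
    int const relative_row = index / cols_in_block;
    // ... copy the element at (relative_row, relative_col) here ...
    (void)relative_col;
    (void)relative_row;
  }
}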
+ for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { + auto const relative_col = index % cols_in_block; + auto const relative_row = index / cols_in_block; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + + auto const shared_memory_row_offset = block_row_size * relative_row; + auto const shared_memory_offset = shared_col_offsets[relative_col] - shared_col_offsets[0] + + shared_memory_row_offset + shared_row_offset; + auto const column_size = shared_col_sizes[relative_col]; + + int8_t *shmem_src = &shared[subset % stages_count][shared_memory_offset]; + int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; + + cuda::memcpy_async(dst, shmem_src, column_size, subset_barrier); + } + group.sync(); + } + + // wait on the last copies to complete + for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { + block_barrier[i].arrive_and_wait(); + } +} + +/** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block + * @param offsets + * @param output_nm + * @param validity_offsets offset into input data row for validity data + * @param block_infos information about the blocks of work + * @param num_block_infos number of infos in blocks array + * @param input_data pointer to input data + * + */ +__global__ void copy_validity_to_columns( + const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, + const size_type *row_offsets, cudf::bitmask_type **output_nm, const size_type validity_offset, + const block_info *block_infos, const size_type num_block_infos, const int8_t *input_data) { + extern __shared__ int8_t shared_data[]; + int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { + shared_data, shared_data + shmem_used_per_block / 2}; + + // per conversation with DaveB + // each thread of warp reads a single byte of validity - so we read 32 bytes + // then ballot_sync the bits and write the result to shmem + // after we fill shared mem memcpy it out in a blob. 
+ // probably need knobs for number of rows vs columns to balance read/write + auto group = cooperative_groups::this_thread_block(); + + int const blocks_remaining = + std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, + (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); + + __shared__ cuda::barrier + shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&shared_block_barriers[i], group.size()); + } + } + + group.sync(); + + for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { + auto const validity_index = validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + if (validity_block != validity_index) { + shared_block_barriers[validity_index].arrive_and_wait(); + } + int8_t *this_shared_block = shared_blocks[validity_block % 2]; + auto const block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; + auto const block_start_col = block.start_col; + auto const block_start_row = block.start_row; + + auto const num_block_cols = block.num_cols(); + auto const num_block_rows = block.num_rows(); + + auto const num_sections_x = (num_block_cols + 7) / 8; + auto const num_sections_y = (num_block_rows + 31) / 32; + auto const validity_data_col_length = align_offset(num_sections_y, 4); + auto const total_sections = num_sections_x * num_sections_y; + + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + + // the block is divided into sections. A warp operates on a section at a time. + for (int my_section_idx = warp_id; my_section_idx < total_sections; + my_section_idx += warps_per_block) { + // convert to rows and cols + auto const section_x = my_section_idx % num_sections_x; + auto const section_y = my_section_idx / num_sections_x; + + auto const relative_col = section_x * 8; + auto const relative_row = section_y * 32 + lane_id; + auto const absolute_col = relative_col + block_start_col; + auto const absolute_row = relative_row + block_start_row; + auto const rows_left = num_rows - absolute_row; + + auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_row < num_rows); + + if (absolute_row < num_rows) { + auto const my_byte = + input_data[row_offsets[absolute_row] + validity_offset + absolute_col / 8]; + + // so every thread that is participating in the warp has a byte, but it's row-based + // data and we need it in column-based. So we shiffle the bits around to make + // the bytes we actually write. 
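The store that follows narrows to 1, 2, 3 or 4 bytes depending on how many rows (or, in the column-to-row kernel, columns) remain, so the last partial word of validity never writes past the end of the shared buffer. A host-side sketch of that narrowing with a hypothetical helper name; memcpy of the low bytes expresses the same cases as the kernel's reinterpret_casts, assuming a little-endian target.

#include <cstdint>
#include <cstring>

// bits_left plays the role of rows_left / cols_left in the kernels above.
void store_partial_validity_word(uint8_t* dst, uint32_t validity_data, int bits_left)
{
  int const bytes_to_write = bits_left <= 8 ? 1 : bits_left <= 16 ? 2 : bits_left <= 24 ? 3 : 4;
  std::memcpy(dst, &validity_data, bytes_to_write);  // low bytes only
}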
+ for (int i = 0, byte_mask = 1; i < 8 && relative_col + i < num_columns; + ++i, byte_mask <<= 1) { + auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); + // lead thread in each warp writes data + if (threadIdx.x % detail::warp_size == 0) { + auto const validity_write_offset = + validity_data_col_length * (relative_col + i) + relative_row / 8; + + if (rows_left <= 8) { + // write byte + this_shared_block[validity_write_offset] = validity_data & 0xFF; + } else if (rows_left <= 16) { + // write int16 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + } else if (rows_left <= 24) { + // write int16 and then int8 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; + } else { + // write int32 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data; + } + } + } + } + } + + // make sure entire block has finished copy + group.sync(); + + // now async memcpy the shared + for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { + auto const relative_col = col - block.start_col; + + cuda::memcpy_async( + output_nm[col] + word_index(block_start_row), + &this_shared_block[validity_data_col_length * relative_col], + util::div_rounding_up_unsafe(num_block_rows, 8), + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); + } + } + + // wait for last blocks of data to arrive + auto const num_blocks_to_wait = blocks_remaining > NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED ? + NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED : + blocks_remaining; + for (int validity_block = 0; validity_block < num_blocks_to_wait; ++validity_block) { + shared_block_barriers[validity_block].arrive_and_wait(); + } +} + +#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + /** * Calculate the dimensions of the kernel for fixed width only columns. * @param [in] num_columns the number of columns being copied. @@ -317,7 +953,6 @@ static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, const cudf::size_type num_rows, const cudf::size_type size_per_row, dim3 &blocks, dim3 &threads) { - // We have found speed degrades when a thread handles more than 4 columns. // Each block is 2 dimensional. The y dimension indicates the columns. // We limit this to 32 threads in the y dimension so we can still @@ -327,10 +962,9 @@ static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, // in the x dimension because we use atomic operations at the block // level when writing validity data out to main memory, and that would // need to change if we split a word of validity data between blocks. - int y_block_size = (num_columns + 3) / 4; - if (y_block_size > 32) { + int y_block_size = (num_columns + 3) / 4; // cudf::util::div_rounding_up_safe(num_columns, 4); + if (y_block_size > 32) y_block_size = 32; - } int x_possible_block_size = 1024 / y_block_size; // 48KB is the default setting for shared memory per block according to the cuda tutorials // If someone configures the GPU to only have 16 KB this might not work. @@ -373,15 +1007,15 @@ static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, * going from start row and containing the next num_rows. Most of the parameters passed * into this function are common between runs and should be calculated once. 
*/ -static std::unique_ptr fixed_width_convert_to_rows( - const cudf::size_type start_row, const cudf::size_type num_rows, - const cudf::size_type num_columns, const cudf::size_type size_per_row, - std::unique_ptr> &column_start, - std::unique_ptr> &column_size, - std::unique_ptr> &input_data, - std::unique_ptr> &input_nm, - const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { +static std::unique_ptr +fixed_width_convert_to_rows(const cudf::size_type start_row, const cudf::size_type num_rows, + const cudf::size_type num_columns, const cudf::size_type size_per_row, + rmm::device_uvector &column_start, + rmm::device_uvector &column_size, + rmm::device_uvector &input_data, + rmm::device_uvector &input_nm, + const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { int64_t total_allocation = size_per_row * num_rows; // We made a mistake in the split somehow CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); @@ -397,30 +1031,23 @@ static std::unique_ptr fixed_width_convert_to_rows( dim3 blocks; dim3 threads; int shared_size = - calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); copy_from_fixed_width_columns<<>>( - start_row, num_rows, num_columns, size_per_row, column_start->data(), column_size->data(), - input_data->data(), input_nm->data(), data->mutable_view().data()); + start_row, num_rows, num_columns, size_per_row, column_start.data(), column_size.data(), + input_data.data(), input_nm.data(), data->mutable_view().data()); return cudf::make_lists_column(num_rows, std::move(offsets), std::move(data), 0, - rmm::device_buffer{}, stream, mr); + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr); } static cudf::data_type get_data_type(const cudf::column_view &v) { return v.type(); } -static bool is_fixed_width(const cudf::data_type &t) { - return cudf::is_fixed_width(t); -} - -static inline int32_t align_offset(int32_t offset, std::size_t alignment) { - return (offset + alignment - 1) & ~(alignment - 1); -} - static inline bool are_all_fixed_width(std::vector const &schema) { - return std::all_of(schema.begin(), schema.end(), cudf::java::is_fixed_width); + return std::all_of(schema.begin(), schema.end(), + [](const cudf::data_type &t) { return cudf::is_fixed_width(t); }); } /** @@ -449,30 +1076,443 @@ static inline int32_t compute_fixed_width_layout(std::vector co // Now we need to add in space for validity // Eventually we can think about nullable vs not nullable, but for now we will just always add it // in - int32_t validity_bytes_needed = (schema.size() + 7) / 8; + int32_t validity_bytes_needed = + (schema.size() + 7) / 8; // cudf::util::div_rounding_up_safe(schema.size(), 8); // validity comes at the end and is byte aligned so we can pack more in. 
at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned return align_offset(at_offset, 8); // 8 bytes (64 bits) } +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + +template +static size_type compute_column_information(iterator begin, iterator end, + std::vector &column_starts, + std::vector &column_sizes) //, +// std::function nested_type_cb) +{ + size_type fixed_width_size_per_row = 0; + for (auto cv = begin; cv != end; ++cv) { + auto col_type = std::get<0>(*cv); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + // if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + } + + auto validity_offset = detail::align_offset(fixed_width_size_per_row, 4); + column_starts.push_back(validity_offset); + + return fixed_width_size_per_row; +} + +std::vector +build_validity_block_infos(size_type const &num_columns, size_type const &num_rows, + size_type const &shmem_limit_per_block, + std::vector const &row_batches) { + auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); + auto const column_stride = align_offset( + [&]() { + if (desired_rows_and_columns > num_columns) { + // not many columns, group it into 8s and ship it off + return std::min(8, num_columns); + } else { + return util::round_down_safe(desired_rows_and_columns, 8); + } + }(), + 8); + // we fit as much as we can given the column stride + auto const row_stride = std::min(num_rows, shmem_limit_per_block * 8 / column_stride); + + std::vector validity_block_infos; + for (int col = 0; col < num_columns; col += column_stride) { + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int row = 0; + while (row < num_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(row_stride, rows_left_in_batch); + + validity_block_infos.emplace_back(detail::block_info{ + col, row, std::min(col + column_stride - 1, num_columns - 1), row + window_height - 1}); + row += window_height; + rows_left_in_batch -= window_height; + } + } + + return validity_block_infos; +} + +std::vector build_block_infos(std::vector const &column_sizes, + std::vector const &column_starts, + std::vector const &row_batches, + size_type const total_number_of_rows, + size_type const &shmem_limit_per_block) { + std::vector block_infos; + + // block infos are organized with the windows going "down" the columns + // this provides the most coalescing of memory access + int current_window_width = 0; + int current_window_start_col = 0; + + // build the blocks for a specific set of columns + auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( + int const start_col, int const end_col, int const desired_window_height) { + int current_window_start_row = 0; + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; + 
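+    // walk the rows from top to bottom: each pass of the loop below emits one block_info
+    // covering columns [start_col, end_col] and at most desired_window_height rows, clipped
+    // so that no block ever straddles a row-batch boundary (each batch ends up in its own
+    // output buffer).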
while (i < total_number_of_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(desired_window_height, rows_left_in_batch); + + block_infos.emplace_back(detail::block_info{ + start_col, current_window_start_row, end_col, + std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), + current_window_row_batch}); + + i += window_height; + current_window_start_row += window_height; + rows_left_in_batch -= window_height; + } + }; + + // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write + // would be memory cache line sized access, but since other blocks will read/write the edges this + // may not turn out to be overly important. For now, we will attempt to build a square window as + // far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we + // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in + // bytes, not rows or columns. + size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); + int const window_height = + std::clamp(util::round_up_safe( + optimal_square_len <= (size_type)column_sizes.size() ? + std::min(optimal_square_len / column_sizes[0], total_number_of_rows) : + row_batches[0].row_count / 2, + 32), + 1, row_batches[0].row_count); + + auto calc_admin_data_size = [](int num_cols) -> size_type { + // admin data is the column sizes and column start information. + // this is copied to shared memory as well and needs to be accounted for + // in the window calculation. + return num_cols * sizeof(size_type) + num_cols * sizeof(size_type); + }; + + int row_size = 0; + + // march each column and build the blocks of appropriate sizes + for (unsigned int col = 0; col < column_sizes.size(); ++col) { + auto const col_size = column_sizes[col]; + + // align size for this type + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_aligned = detail::align_offset(row_size, alignment_needed); + auto row_size_with_this_col = row_size_aligned + col_size; + auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); + + if (row_size_with_end_pad * window_height + + calc_admin_data_size(col - current_window_start_col) > + shmem_limit_per_block) { + // too large, close this window, generate vertical blocks and restart + build_blocks(current_window_start_col, col - 1, window_height); + row_size = + detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); + row_size += col_size; // alignment required for shared memory window boundary to match + // alignment of output row + current_window_start_col = col; + current_window_width = 0; + } else { + row_size = row_size_with_this_col; + current_window_width++; + } + } + + // build last set of blocks + if (current_window_width > 0) { + build_blocks(current_window_start_col, (int)column_sizes.size() - 1, window_height); + } + + return block_infos; +} + +#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + +} // namespace detail + std::vector> convert_to_rows(cudf::table_view const &tbl, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the + // data, but small enough that multiple columns fit in memory so the 
writes can coalese as well. + // Potential optimization for window sizes. + const size_type num_columns = tbl.num_columns(); + const size_type num_rows = tbl.num_rows(); + + int device_id; + CUDA_TRY(cudaGetDevice(&device_id)); + int total_shmem; + CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + + // TODO: kernels fail to launch if we use all the available shared memory. + total_shmem -= 1024; + + int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + + // break up the work into blocks, which are a starting and ending row/col #. + // this window size is calculated based on the shared memory size available + // we want a single block to fill up the entire shared memory space available + // for the transpose-like conversion. + + // There are two different processes going on here. The GPU conversion of the data + // and the writing of the data into the list of byte columns that are a maximum of + // 2 gigs each due to offset maximum size. The GPU conversion portion has to understand + // this limitation because the column must own the data inside and as a result it must be + // a distinct allocation for that column. Copying the data into these final buffers would + // be prohibitively expensive, so care is taken to ensure the GPU writes to the proper buffer. + // The windows are broken at the boundaries of specific rows based on the row sizes up + // to that point. These are row batches and they are decided first before building the + // windows so the windows can be properly cut around them. + + // Get the pointers to the input columnar data ready + std::vector input_data; + std::vector input_nm; + input_data.reserve(num_columns); + input_nm.reserve(num_columns); + for (size_type column_number = 0; column_number < num_columns; column_number++) { + column_view cv = tbl.column(column_number); + auto const col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (!nested_type) { + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + } + auto dev_input_data = make_device_uvector_async(input_data, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); + + std::vector row_sizes; // size of each row in bytes including any alignment padding + std::vector row_offsets; // offset from the start of the data to this row + std::vector column_sizes; // byte size of each column + std::vector column_starts; // offset of column inside a row including alignment + std::vector + variable_width_columns; // list of the variable width columns in the table + row_sizes.reserve(num_rows); + row_offsets.reserve(num_rows); + column_sizes.reserve(num_columns); + column_starts.reserve(num_columns + 1); // we add a final offset for validity data start + + auto iter = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [&tbl](auto i) -> std::tuple { + return std::make_tuple(tbl.column(i).type(), tbl.column(i)); + }); + + size_type fixed_width_size_per_row = + detail::compute_column_information(iter, iter + num_columns, column_starts, + column_sizes); //, + // [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); + /* size_type fixed_width_size_per_row = 0; + for (int col = 0; col < num_columns; ++col) { + auto cv = tbl.column(col); + auto col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (nested_type) { 
variable_width_columns.push_back(cv); } + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + }*/ + + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); + + std::vector row_batches; + + auto calculate_variable_width_row_data_size = [](int const row) { + // each level of variable-width data will add an offset/length + // uint64 of data. The first of which is inside the fixed-width + // data itself and needs to be aligned based on what is around + // that data. This is handled above with the fixed-width calculations + // for that reason. We may still need to add more of these offset/length + // combinations if the nesting is deeper than one level as these + // will be included in the variable-width data blob at the end of the + // row. + return 0; + /* auto c = variable_width_columns[col]; + while (true) { + auto col_offsets = c.child(0).data(); + auto col_data_size = size_of(c.child(1).type()); + std::size_t alignment_needed = col_data_size; + + row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; + if (c.num_children() == 0) { + break; + } + c = c.child(1); + } + */ + }; + + uint64_t row_batch_size = 0; + uint64_t total_table_size = 0; + size_type row_batch_rows = 0; + uint64_t row_offset = 0; + + // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then + // calculate the size of each row's variable-width data and validity as well. 
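+  // In other words (a sketch; variable-width data is currently stubbed out to 0 bytes),
+  // each row ends up occupying
+  //
+  //   align_offset(fixed_width_size_per_row + num_bitmask_words(num_columns) * 4, 8)
+  //
+  // bytes, and rows are accumulated into a row_batch until the running aligned size would
+  // overflow size_type (the 2GB column limit), at which point a new batch is started on a
+  // 32-row boundary.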
+ auto validity_size = num_bitmask_words(num_columns) * 4; + for (int row = 0; row < num_rows; ++row) { + auto aligned_row_batch_size = + detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned + row_sizes[row] = fixed_width_size_per_row; + // validity is byte aligned + row_sizes[row] += validity_size; + // variable width data is 8-byte aligned + row_sizes[row] = detail::align_offset(row_sizes[row], 8) + + calculate_variable_width_row_data_size(row); // rows are 8 byte aligned + + if ((uint64_t)aligned_row_batch_size + row_sizes[row] > + (uint64_t)std::numeric_limits::max()) { + // a new batch starts at the last 32-row boundary + row_batches.push_back( + detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batch_size = 0; + row_batch_rows = row_batch_rows & 31; + row_offset = 0; + aligned_row_batch_size = 0; + } + row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned + row_offsets.push_back(row_offset); + row_batch_size = aligned_row_batch_size + row_sizes[row]; + row_offset += row_sizes[row]; + total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned + total_table_size += row_sizes[row]; + row_batch_rows++; + } + if (row_batch_size > 0) { + row_batches.push_back( + detail::row_batch{static_cast(row_batch_size), row_batch_rows}); + } + + auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); + + std::vector output_buffers; + std::vector output_data; + output_data.reserve(row_batches.size()); + for (uint i = 0; i < row_batches.size(); ++i) { + rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); + output_data.push_back(static_cast(temp.data())); + output_buffers.push_back(std::move(temp)); + } + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); + + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); + + // blast through the entire table and convert it + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS)); + dim3 threads(256); + + detail::copy_from_columns<<>>( + num_rows, num_columns, shmem_limit_per_block, block_infos.size(), dev_input_data.data(), + dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), dev_row_offsets.data(), + reinterpret_cast(dev_output_data.data())); + + auto validity_block_infos = + build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); + + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); + dim3 validity_blocks( + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); + detail::copy_validity_from_columns<<>>( + num_rows, num_columns, shmem_limit_per_block, dev_row_offsets.data(), dev_output_data.data(), + column_starts.back(), dev_validity_block_infos.data(), validity_block_infos.size(), + dev_input_nm.data()); + + // split up the output buffer into multiple buffers based on row batch sizes + // and create list of byte columns + int offset_offset = 0; + std::vector> ret; + for (uint i = 0; i < row_batches.size(); ++i) { + // compute offsets for this row batch + std::vector offset_vals; + offset_vals.reserve(row_batches[i].row_count + 1); + size_type cur_offset = 0; + offset_vals.push_back(cur_offset); + for (int row = 0; row < 
row_batches[i].row_count; ++row) { + cur_offset = detail::align_offset(cur_offset, 8) + row_sizes[row + offset_offset]; + offset_vals.push_back(cur_offset); + } + offset_offset += row_batches[i].row_count; + + auto dev_offsets = make_device_uvector_async(offset_vals, stream, mr); + auto offsets = std::make_unique(data_type{type_id::INT32}, + (size_type)offset_vals.size(), dev_offsets.release()); + + auto data = std::make_unique(data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, + std::move(output_buffers[i])); + + ret.push_back( + cudf::make_lists_column(row_batches[i].row_count, std::move(offsets), std::move(data), 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr)); + } + + return ret; +#else + CUDF_FAIL("Column to row conversion optimization requires volta or later hardware."); + return {}; +#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +} + +std::vector> +old_convert_to_rows(cudf::table_view const &tbl, rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { const cudf::size_type num_columns = tbl.num_columns(); std::vector schema; schema.resize(num_columns); - std::transform(tbl.begin(), tbl.end(), schema.begin(), cudf::java::get_data_type); + std::transform(tbl.begin(), tbl.end(), schema.begin(), detail::get_data_type); - if (are_all_fixed_width(schema)) { + if (detail::are_all_fixed_width(schema)) { std::vector column_start; std::vector column_size; - int32_t size_per_row = compute_fixed_width_layout(schema, column_start, column_size); - auto dev_column_start = copy_to_dev_async(column_start, stream, mr); - auto dev_column_size = copy_to_dev_async(column_size, stream, mr); + int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); + auto dev_column_start = make_device_uvector_async(column_start, stream, mr); + auto dev_column_size = make_device_uvector_async(column_size, stream, mr); int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; // Make the number of rows per batch a multiple of 32 so we don't have to worry about @@ -489,8 +1529,8 @@ std::vector> convert_to_rows(cudf::table_view cons input_data.emplace_back(cv.data()); input_nm.emplace_back(cv.null_mask()); } - auto dev_input_data = copy_to_dev_async(input_data, stream, mr); - auto dev_input_nm = copy_to_dev_async(input_nm, stream, mr); + auto dev_input_data = make_device_uvector_async(input_data, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); using ScalarType = cudf::scalar_type_t; auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); @@ -506,7 +1546,7 @@ std::vector> convert_to_rows(cudf::table_view cons for (cudf::size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { cudf::size_type row_count = num_rows - row_start; row_count = row_count > max_rows_per_batch ? 
max_rows_per_batch : row_count; - ret.emplace_back(fixed_width_convert_to_rows( + ret.emplace_back(detail::fixed_width_convert_to_rows( row_start, row_count, num_columns, size_per_row, dev_column_start, dev_column_size, dev_input_data, dev_input_nm, *zero, *step, stream, mr)); } @@ -521,7 +1561,129 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in std::vector const &schema, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + // verify that the types are what we expect + cudf::column_view child = input.child(); + cudf::type_id list_type = child.type().id(); + CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, + "Only a list of bytes is supported as input"); + + cudf::size_type num_columns = schema.size(); + cudf::size_type num_rows = input.parent().size(); + + int device_id; + CUDA_TRY(cudaGetDevice(&device_id)); + int total_shmem; + CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + + // TODO: unable to launch a kernel with all shared used + total_shmem -= 1024; + int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + + std::vector column_starts; + std::vector column_sizes; + + auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { + return std::make_tuple(schema[i], nullptr); + }); + size_type fixed_width_size_per_row = detail::compute_column_information( + iter, iter + num_columns, column_starts, column_sizes); //, [](void *) {}); + + size_type validity_size = num_bitmask_words(num_columns) * 4; + + size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); + + // Ideally we would check that the offsets are all the same, etc. 
but for now + // this is probably fine + CUDF_EXPECTS(row_size * num_rows == child.size(), "The layout of the data appears to be off"); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); + + // build the row_batches from the passed in list column + std::vector row_batches; + + row_batches.push_back(detail::row_batch{child.size(), num_rows}); + + // Allocate the columns we are going to write into + std::vector> output_columns; + std::vector output_data; + std::vector output_nm; + for (cudf::size_type i = 0; i < num_columns; i++) { + auto column = cudf::make_fixed_width_column(schema[i], num_rows, + cudf::mask_state::UNINITIALIZED, stream, mr); + auto mut = column->mutable_view(); + output_data.emplace_back(mut.data()); + output_nm.emplace_back(mut.null_mask()); + output_columns.emplace_back(std::move(column)); + } + + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); + + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); + + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); + + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); +#if defined(DEBUG) + dim3 threads(std::min(std::min(128, shmem_limit_per_block / 8), (int)child.size())); +#else + dim3 threads(std::min(256, (int)child.size())); +#endif + detail::copy_to_columns<<>>( + num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), + dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), + block_infos.size(), child.data()); + + auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); + auto const column_stride = [&]() { + if (desired_rows_and_columns > num_columns) { + // not many columns, group it into 8s and ship it off + return std::min(8, num_columns); + } else { + return util::round_down_safe(desired_rows_and_columns, 8); + } + }(); + auto const row_stride = [&]() { + // we fit as much as we can, we know the column stride now, so calculate the row + return std::min(num_rows, util::round_down_safe(shmem_limit_per_block * 8 / column_stride, 32)); + /* if (desired_rows_and_columns > num_rows) { + return std::min(32, num_rows); + } else { + return util::round_down_safe(desired_rows_and_columns, 32); + }*/ + }(); + std::vector validity_block_infos; + for (int col = 0; col < num_columns; col += column_stride) { + for (int row = 0; row < num_rows; row += row_stride) { + validity_block_infos.emplace_back( + detail::block_info{col, row, std::min(col + column_stride - 1, num_columns - 1), + std::min(row + row_stride - 1, num_rows - 1)}); + } + } + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); + dim3 validity_blocks( + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + + dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); + detail:: + copy_validity_to_columns<<>>( + num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), + dev_output_nm.data(), column_starts.back(), dev_validity_block_infos.data(), + validity_block_infos.size(), child.data()); + + return std::make_unique(std::move(output_columns)); +#else + CUDF_FAIL("Row to column conversion optimization requires volta or later hardware."); + return {}; +#endif // 
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +} +std::unique_ptr old_convert_from_rows(cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { // verify that the types are what we expect cudf::column_view child = input.child(); cudf::type_id list_type = child.type().id(); @@ -530,19 +1692,19 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in cudf::size_type num_columns = schema.size(); - if (are_all_fixed_width(schema)) { + if (detail::are_all_fixed_width(schema)) { std::vector column_start; std::vector column_size; cudf::size_type num_rows = input.parent().size(); - int32_t size_per_row = compute_fixed_width_layout(schema, column_start, column_size); + int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); // Ideally we would check that the offsets are all the same, etc. but for now // this is probably fine CUDF_EXPECTS(size_per_row * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_column_start = copy_to_dev_async(column_start, stream, mr); - auto dev_column_size = copy_to_dev_async(column_size, stream, mr); + auto dev_column_start = make_device_uvector_async(column_start, stream); + auto dev_column_size = make_device_uvector_async(column_size, stream); // Allocate the columns we are going to write into std::vector> output_columns; @@ -557,17 +1719,17 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in output_columns.emplace_back(std::move(column)); } - auto dev_output_data = copy_to_dev_async(output_data, stream, mr); - auto dev_output_nm = copy_to_dev_async(output_nm, stream, mr); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); dim3 blocks; dim3 threads; int shared_size = - calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - copy_to_fixed_width_columns<<>>( - num_rows, num_columns, size_per_row, dev_column_start->data(), dev_column_size->data(), - dev_output_data->data(), dev_output_nm->data(), child.data()); + detail::copy_to_fixed_width_columns<<>>( + num_rows, num_columns, size_per_row, dev_column_start.data(), dev_column_size.data(), + dev_output_data.data(), dev_output_nm.data(), child.data()); return std::make_unique(std::move(output_columns)); } else { @@ -575,5 +1737,4 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in } } -} // namespace java } // namespace cudf diff --git a/java/src/main/native/src/row_conversion.hpp b/java/src/main/native/src/row_conversion.hpp index 17abde8df19..517202f3892 100644 --- a/java/src/main/native/src/row_conversion.hpp +++ b/java/src/main/native/src/row_conversion.hpp @@ -25,12 +25,24 @@ namespace cudf { namespace java { +std::vector> +old_convert_to_rows(cudf::table_view const &tbl, + // TODO need something for validity + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + std::vector> convert_to_rows(cudf::table_view const &tbl, // TODO need something for validity rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); +std::unique_ptr +old_convert_from_rows(cudf::lists_column_view const &input, + std::vector const &schema, + 
rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, std::vector const &schema, rmm::cuda_stream_view stream = rmm::cuda_stream_default, From 92f52cd2b97ac03dec5e9752f1d6cd4e08b4323e Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 21 Sep 2021 21:39:00 +0000 Subject: [PATCH 50/80] fixing validity alignment bugs --- cpp/src/row_conversion/row_conversion.cu | 144 +++++++++---- cpp/tests/row_conversion/row_conversion.cpp | 226 +++++++++++++++++++- java/src/main/native/src/row_conversion.cu | 22 +- 3 files changed, 333 insertions(+), 59 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 42c40e0542d..0409a65b630 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -493,7 +493,7 @@ __global__ void copy_from_columns(const size_type num_rows, input_src, col_size); - // copy the main + // copy the element to global memory cuda::memcpy_async( &shared[fetch % stages_count][shared_offset], input_src, col_size, fetch_barrier); } @@ -568,7 +568,11 @@ __global__ void copy_validity_from_columns(const size_type num_rows, int8_t* shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { shared_data, shared_data + shmem_used_per_block / 2}; - constexpr bool print_debug = false; //(threadIdx.x==0 || threadIdx.x == 32) && blockIdx.x == 0; + int8_t* output_check_addr = nullptr; + int8_t* output_block_start = nullptr; + size_type output_block_size = 0; + + bool print_debug = false; //threadIdx.x==0 && blockIdx.x == 0; // if (blockIdx.x != 3 || threadIdx.x / 32 != 0) return; if (print_debug) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -659,12 +663,14 @@ __global__ void copy_validity_from_columns(const size_type num_rows, if (print_debug) printf( - "%d %d - my warp is %d, %d total sections, %d warps per block, blockDim.x=%d, warp side " + "%d %d - my warp is %d, %d total sections(%d x, %d y), %d warps per block, blockDim.x=%d, warp size " "%d\n", threadIdx.x, blockIdx.x, warp_id, total_sections, + num_sections_x, + num_sections_y, warps_per_block, blockDim.x, detail::warp_size); @@ -672,10 +678,10 @@ __global__ void copy_validity_from_columns(const size_type num_rows, for (int my_section_idx = warp_id; my_section_idx < total_sections; my_section_idx += warps_per_block) { // convert to rows and cols - auto const section_x = my_section_idx / num_sections_x; - auto const section_y = my_section_idx % num_sections_x; + auto const section_x = my_section_idx % num_sections_x; + auto const section_y = my_section_idx / num_sections_x; - if (print_debug) printf("working on section %d of %d...\n", section_x, num_sections_x); + if (print_debug) printf("working on section %d,%d - %d of %d...\n", section_x, section_y, my_section_idx, total_sections); auto const relative_col = section_x * 32 + lane_id; auto const relative_row = section_y * 8; auto const absolute_col = relative_col + block.start_col; @@ -722,7 +728,7 @@ __global__ void copy_validity_from_columns(const size_type num_rows, absolute_col); // every thread that is participating in the warp has a byte, but it's column-based - // data and we need it in row-based. So we shiffle the bits around with ballot_sync to make + // data and we need it in row-based. So we shuffle the bits around with ballot_sync to make // the bytes we actually write. 
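        // For example (illustrative lane values): lane 0 holds the validity byte for
        // column c+0, lane 1 for column c+1, ..., lane 31 for column c+31, each byte
        // covering rows r..r+7 of its own column. On iteration i below,
        // __ballot_sync(participation_mask, my_byte & (1 << i)) gathers bit i (row r+i)
        // from all 32 lanes, so the returned word is the row-major validity for columns
        // c..c+31 of row r+i, which the lead lane then writes out as 1 to 4 bytes.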
for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); @@ -744,23 +750,23 @@ __global__ void copy_validity_from_columns(const size_type num_rows, if (cols_left <= 8) { // write byte if (print_debug) - printf("writing single byte to shared offset 0x%x which is %p...\n", - validity_write_offset, + printf("%d %d - writing single byte to shared offset 0x%x which is %p...\n", + threadIdx.x, blockIdx.x, validity_write_offset, &this_shared_block[validity_write_offset]); this_shared_block[validity_write_offset] = validity_data & 0xFF; } else if (cols_left <= 16) { // write int16 if (print_debug) - printf("writing two bytes to shared offset 0x%x which is %p...\n", - validity_write_offset, + printf("%d %d - writing two bytes to shared offset 0x%x which is %p...\n", + threadIdx.x, blockIdx.x, validity_write_offset, &this_shared_block[validity_write_offset]); *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data & 0xFFFF; } else if (cols_left <= 24) { // write int16 and then int8 if (print_debug) - printf("writing three bytes to shared offset 0x%x which is %p...\n", - validity_write_offset, + printf("%d %d - writing three bytes to shared offset 0x%x which is %p...\n", + threadIdx.x, blockIdx.x, validity_write_offset, &this_shared_block[validity_write_offset]); *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data & 0xFFFF; @@ -768,8 +774,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, } else { // write int32 if (print_debug) - printf("writing 4 bytes to shared offset 0x%x which is %p...\n", - validity_write_offset, + printf("%d %d - writing 4 bytes to shared offset 0x%x which is %p...\n", + threadIdx.x, blockIdx.x, validity_write_offset, &this_shared_block[validity_write_offset]); *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data; @@ -816,6 +822,18 @@ __global__ void copy_validity_from_columns(const size_type num_rows, auto const output_ptr = output_data[block.buffer_num] + row_offsets[row] + validity_offset + block.start_col / 8; auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); + +/* if (num_rows >= 5006) { + auto const row5006_col_65 = output_data[block.buffer_num] + row_offsets[5006] + validity_offset + 65 / 8; + if (output_ptr >= row5006_col_65 && output_ptr <= row5006_col_65 + 4) { + printf("%d %d - writing bytes from %p(0x%x)-%p to %p-%p that overlap global %p(0x%x), which is row 5006, col 65!\n", threadIdx.x, blockIdx.x, &this_shared_block[validity_data_row_length * relative_row], this_shared_block[validity_data_row_length * relative_row], &this_shared_block[validity_data_row_length * relative_row + num_bytes], output_ptr, output_ptr + num_bytes, row5006_col_65, *row5006_col_65); + printf("%d %d - block information\n%d,%d -> %d,%d\n%d columns, %d rows\n", threadIdx.x, blockIdx.x, block.start_col, block.start_row, block.end_col, block.end_row, block.num_cols(), block.num_rows()); + output_check_addr = row5006_col_65; + output_block_start = output_ptr; + output_block_size = num_bytes; + } + }*/ + cuda::memcpy_async( output_ptr, &this_shared_block[validity_data_row_length * relative_row], @@ -851,6 +869,17 @@ __global__ void copy_validity_from_columns(const size_type num_rows, ++validity_block) { shared_block_barriers[validity_block].arrive_and_wait(); } + if (output_check_addr != nullptr) { + printf("output check after write to %p - 0x%x\n", output_check_addr, 
*output_check_addr); + for (int i=0; i get_admin_data_sizes(size_t col_size_size, @@ -901,12 +930,12 @@ static __device__ void fetch_blocks_for_row_to_column( for (; fetch_index < static_cast(total_blocks) && fetch_index < (processing_index + read_ahead_count); ++fetch_index) { - if (debug_print) - printf("fetching block %lu of %d\n", - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index, - total_blocks); auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index]; + if (debug_print) + printf("fetching block %lu of %d for start col %d, end col %d. Starting col offset is %p, ending offset %p\n", + blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index, + total_blocks, fetch_block.start_col, fetch_block.end_col, &col_offsets[fetch_block.start_col], &col_offsets[fetch_block.end_col]); auto const fetch_block_start_row = fetch_block.start_row; auto const fetch_block_end_row = fetch_block.end_row; auto const starting_col_offset = col_offsets[fetch_block.start_col]; @@ -948,7 +977,7 @@ static __device__ void fetch_blocks_for_row_to_column( &shared[fetch_index % max_resident_blocks][shared_row_offset], &col_offsets[fetch_block.start_col], col_offset_bytes); - cuda::memcpy_async(group, + cuda::memcpy_async(group, &shared[fetch_index % max_resident_blocks][shared_row_offset], &col_offsets[fetch_block.start_col], col_offset_bytes, @@ -983,7 +1012,7 @@ static __device__ void fetch_blocks_for_row_to_column( fetch_index % max_resident_blocks, &shared[fetch_index % max_resident_blocks][shared_offset], &input_data[row_offsets[row] + starting_col_offset]); - // copy the main + // copy the main cuda::memcpy_async(&shared[fetch_index % max_resident_blocks][shared_offset], &input_data[row_offsets[row] + starting_col_offset], fetch_block_row_size, @@ -1029,7 +1058,7 @@ __global__ void copy_to_columns(const size_type num_rows, // to speed up some of the random access memory we do, we copy col_sizes and col_offsets // to shared memory for each of the blocks that we work on - /*constexpr*/ bool debug_print = false; // threadIdx.x == 0; + /*constexpr*/ bool debug_print = false; //threadIdx.x == 0 && blockIdx.x == 0; constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; auto group = cooperative_groups::this_thread_block(); extern __shared__ int8_t shared_data[]; @@ -1037,12 +1066,14 @@ __global__ void copy_to_columns(const size_type num_rows, if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); + printf("%d block infos are at %p and my index is %d\n", num_block_infos, block_infos, blockIdx.x); /* printf("Row Offsets:\n"); for (int i=0; i build_validity_block_infos( }(), 8); // we fit as much as we can given the column stride - auto const row_stride = std::min(num_rows, shmem_limit_per_block * 8 / column_stride); + // note that an element in the table takes just 1 bit, but a row with a single + // element still takes 8 bytes! 
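+  // For instance (illustrative numbers): with column_stride = 128, a row of the validity
+  // window needs div_rounding_up_unsafe(128, 8) = 16 bytes, which is already 8-byte
+  // aligned, so a 16KB shmem_limit_per_block allows 16 * 1024 / 16 = 1024 rows per
+  // validity block (further capped at num_rows below).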
+ auto const bytes_per_row = align_offset(util::div_rounding_up_unsafe(column_stride, 8), 8); + auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); std::vector validity_block_infos; for (int col = 0; col < num_columns; col += column_stride) { @@ -1695,6 +1747,7 @@ std::vector build_block_infos(std::vector const& column_s } int const window_height = std::min(desired_window_height, rows_left_in_batch); +// printf("block %d, %d to %d, %d\n", start_col, current_window_start_row, end_col, std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1)); block_infos.emplace_back(detail::block_info{ start_col, current_window_start_row, @@ -1716,11 +1769,7 @@ std::vector build_block_infos(std::vector const& column_s // bytes, not rows or columns. size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); int const window_height = - std::clamp(util::round_up_safe( - optimal_square_len <= (size_type)column_sizes.size() - ? std::min(optimal_square_len / column_sizes[0], total_number_of_rows) - : row_batches[0].row_count / 2, - 32), + std::clamp(util::round_up_safe(std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], total_number_of_rows), 32), 1, row_batches[0].row_count); #if defined(DEBUG) @@ -1787,7 +1836,7 @@ std::vector build_block_infos(std::vector const& column_s shmem_limit_per_block); #endif // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col - 1, window_height); + build_blocks(current_window_start_col, col == 0 ? col : col - 1, window_height); row_size = detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); #if defined(DEBUG) @@ -1973,6 +2022,16 @@ std::vector> convert_to_rows(cudf::table_view cons } c = c.child(1); } + exclusive_scan([t](int row_index) { + size_type total_row_size = 0; + for (int i=0 i> convert_to_rows(cudf::table_view cons // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. auto validity_size = num_bitmask_words(num_columns) * 4; + // thrust for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned @@ -2310,8 +2370,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const& in auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); auto const column_stride = [&]() { if (desired_rows_and_columns > num_columns) { - // not many columns, group it into 8s and ship it off - return std::min(8, num_columns); + // not many columns, group it into 64s and ship it off + return std::min(64, num_columns); } else { return util::round_down_safe(desired_rows_and_columns, 8); } @@ -2325,6 +2385,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const& in return util::round_down_safe(desired_rows_and_columns, 32); }*/ }(); + printf("column stride is %d and row stride is %d. 
std::min(%d, util::round_down_safe(%d * 8 / %d, 32))\n", column_stride, row_stride, num_rows, shmem_limit_per_block, column_stride); + printf("each block uses %d bytes of shared memory\n", (column_stride / 8) * detail::align_offset(row_stride, 4)); std::vector validity_block_infos; for (int col = 0; col < num_columns; col += column_stride) { for (int row = 0; row < num_rows; row += row_stride) { diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index e38b37e81a6..26e071eef79 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -33,11 +33,19 @@ TEST_F(ColumnToRowTests, Single) { cudf::test::fixed_width_column_wrapper a({-1}); cudf::table_view in(std::vector{a}); + std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; auto old_rows = cudf::old_convert_to_rows(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } + for (uint i = 0; i < old_rows.size(); i++) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } @@ -47,11 +55,19 @@ TEST_F(ColumnToRowTests, Simple) { cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); cudf::table_view in(std::vector{a}); + std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; auto old_rows = cudf::old_convert_to_rows(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } + for (uint i = 0; i < old_rows.size(); i++) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } @@ -63,11 +79,20 @@ TEST_F(ColumnToRowTests, Tall) cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); cudf::table_view in(std::vector{a}); + std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; auto old_rows = cudf::old_convert_to_rows(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } + for (uint i = 0; i < old_rows.size(); i++) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } @@ -77,10 +102,12 @@ TEST_F(ColumnToRowTests, Wide) { std::vector> cols; std::vector views; + std::vector schema; for (int i = 0; i < 256; ++i) { cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); } cudf::table_view in(views); @@ -88,6 +115,13 @@ TEST_F(ColumnToRowTests, Wide) auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = 
cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } + for (uint i = 0; i < old_rows.size(); i++) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } @@ -97,10 +131,13 @@ TEST_F(ColumnToRowTests, SingleByteWide) { std::vector> cols; std::vector views; + std::vector schema; for (int i = 0; i < 256; ++i) { cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); views.push_back(cols.back()); + + schema.push_back(cudf::data_type{cudf::type_id::INT8}); } cudf::table_view in(views); @@ -108,6 +145,59 @@ TEST_F(ColumnToRowTests, SingleByteWide) auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } + + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, Non2Power) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + constexpr auto num_rows = 6 * 1024 + 557; + for (int i = 0; i < 131; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + for (int j=0; jnum_columns(); ++j) { + printf("testing column %d\n", j); + if (j==65) { + printf("old\n"); + cudf::test::print(old_tbl->get_column(j)); + printf("new\n"); + cudf::test::print(new_tbl->get_column(j)); + } + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); + } + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } + for (uint i = 0; i < old_rows.size(); i++) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } @@ -119,11 +209,69 @@ TEST_F(ColumnToRowTests, Big) cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); std::vector> cols; std::vector views; + std::vector schema; - for (int i = 0; i < 256; ++i) { + // 28 columns of 1 million rows + constexpr auto num_rows = 1024 * 1024; + for (int i = 0; i < 28; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, Bigger) +{ + auto r = + 
cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + // 128 columns of 1 million rows + constexpr auto num_rows = 1024 * 1024; + for (int i = 0; i < 128; ++i) { cols.push_back( - cudf::test::fixed_width_column_wrapper(r + 4096 * i, r + 4096 * i + 4096)); + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, Biggest) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + // 128 columns of 2 million rows + constexpr auto num_rows = 2 * 1024 * 1024; + for (int i = 0; i < 128; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); } cudf::table_view in(views); @@ -238,7 +386,7 @@ TEST_F(RowToColumnTests, SingleByteWide) } } -TEST_F(RowToColumnTests, non2power) +TEST_F(RowToColumnTests, Non2Power) { auto r = cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); @@ -246,9 +394,13 @@ TEST_F(RowToColumnTests, non2power) std::vector views; std::vector schema; - cols.push_back(cudf::test::fixed_width_column_wrapper(r, r + 13)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); + constexpr auto num_rows = 6 * 1024 + 557; + for (int i = 0; i < 131; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } cudf::table_view in(views); auto old_rows = cudf::old_convert_to_rows(in); @@ -269,9 +421,67 @@ TEST_F(RowToColumnTests, Big) std::vector views; std::vector schema; - for (int i = 0; i < 256; ++i) { + // 28 columns of 1 million rows + constexpr auto num_rows = 1024 * 1024; + for (int i = 0; i < 28; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, Bigger) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + // 28 columns of 1 million rows + constexpr auto num_rows = 1024 * 1024; + for (int i = 0; i < 128; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + views.push_back(cols.back()); + 
schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, Biggest) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + // 28 columns of 1 million rows + constexpr auto num_rows = 5 * 1024 * 1024; + for (int i = 0; i < 128; ++i) { cols.push_back( - cudf::test::fixed_width_column_wrapper(r + 4096 * i, r + 4096 * i + 4096)); + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 1babbc6fd1a..9f0df3569a7 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -54,7 +54,9 @@ constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; #endif using cudf::detail::make_device_uvector_async; -namespace cudf { +using cudf::detail::warp_size; + +namespace cudf::java { namespace detail { @@ -526,9 +528,9 @@ __global__ void copy_validity_from_columns( align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); auto const total_sections = num_sections_x * num_sections_y; - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + int const warp_id = threadIdx.x / warp_size; + int const lane_id = threadIdx.x % warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / warp_size); // the block is divided into sections. A warp operates on a section at a time. for (int my_section_idx = warp_id; my_section_idx < total_sections; @@ -557,7 +559,7 @@ __global__ void copy_validity_from_columns( // lead thread in each warp writes data auto const validity_write_offset = validity_data_row_length * (relative_row + i) + relative_col / 8; - if (threadIdx.x % detail::warp_size == 0) { + if (threadIdx.x % warp_size == 0) { if (cols_left <= 8) { // write byte this_shared_block[validity_write_offset] = validity_data & 0xFF; @@ -855,12 +857,12 @@ __global__ void copy_validity_to_columns( auto const num_sections_x = (num_block_cols + 7) / 8; auto const num_sections_y = (num_block_rows + 31) / 32; - auto const validity_data_col_length = align_offset(num_sections_y, 4); + auto const validity_data_col_length = num_sections_y * 4; // words to bytes auto const total_sections = num_sections_x * num_sections_y; - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + int const warp_id = threadIdx.x / warp_size; + int const lane_id = threadIdx.x % warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / warp_size); // the block is divided into sections. A warp operates on a section at a time. 
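  // With the section sizes above, each section spans 8 columns by 32 rows of validity:
  // every lane of a warp reads one row's validity byte (8 columns) and the warp's 32 lanes
  // together cover the 32 rows. A block launched with, say, 128 threads has 4 warps, so
  // warp w processes sections w, w + 4, w + 8, ... until total_sections is exhausted.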
for (int my_section_idx = warp_id; my_section_idx < total_sections; @@ -888,7 +890,7 @@ __global__ void copy_validity_to_columns( ++i, byte_mask <<= 1) { auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); // lead thread in each warp writes data - if (threadIdx.x % detail::warp_size == 0) { + if (threadIdx.x % warp_size == 0) { auto const validity_write_offset = validity_data_col_length * (relative_col + i) + relative_row / 8; From 83118d2c63101c31629e6cd3ade17bb772215e75 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 22 Sep 2021 03:11:58 +0000 Subject: [PATCH 51/80] Updates and bug fixes --- .../row_conversion/row_conversion.cpp | 2 +- cpp/src/row_conversion/row_conversion.cu | 206 +++++++----------- cpp/tests/row_conversion/row_conversion.cpp | 36 +-- java/src/main/native/src/row_conversion.cu | 106 ++++----- 4 files changed, 155 insertions(+), 195 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index ad9925e9043..2fe436a22c1 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -19,8 +19,8 @@ #include #include -#include #include +#include #include class RowConversion : public cudf::benchmark { diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 0409a65b630..eb3c4b28b6a 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -568,11 +568,7 @@ __global__ void copy_validity_from_columns(const size_type num_rows, int8_t* shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { shared_data, shared_data + shmem_used_per_block / 2}; - int8_t* output_check_addr = nullptr; - int8_t* output_block_start = nullptr; - size_type output_block_size = 0; - - bool print_debug = false; //threadIdx.x==0 && blockIdx.x == 0; + constexpr bool print_debug = false; // threadIdx.x==0 && blockIdx.x == 0; // if (blockIdx.x != 3 || threadIdx.x / 32 != 0) return; if (print_debug) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -663,7 +659,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, if (print_debug) printf( - "%d %d - my warp is %d, %d total sections(%d x, %d y), %d warps per block, blockDim.x=%d, warp size " + "%d %d - my warp is %d, %d total sections(%d x, %d y), %d warps per block, blockDim.x=%d, " + "warp size " "%d\n", threadIdx.x, blockIdx.x, @@ -681,7 +678,12 @@ __global__ void copy_validity_from_columns(const size_type num_rows, auto const section_x = my_section_idx % num_sections_x; auto const section_y = my_section_idx / num_sections_x; - if (print_debug) printf("working on section %d,%d - %d of %d...\n", section_x, section_y, my_section_idx, total_sections); + if (print_debug) + printf("working on section %d,%d - %d of %d...\n", + section_x, + section_y, + my_section_idx, + total_sections); auto const relative_col = section_x * 32 + lane_id; auto const relative_row = section_y * 8; auto const absolute_col = relative_col + block.start_col; @@ -751,14 +753,18 @@ __global__ void copy_validity_from_columns(const size_type num_rows, // write byte if (print_debug) printf("%d %d - writing single byte to shared offset 0x%x which is %p...\n", - threadIdx.x, blockIdx.x, validity_write_offset, + threadIdx.x, + blockIdx.x, + validity_write_offset, &this_shared_block[validity_write_offset]); this_shared_block[validity_write_offset] = validity_data & 0xFF; } else if 
(cols_left <= 16) { // write int16 if (print_debug) printf("%d %d - writing two bytes to shared offset 0x%x which is %p...\n", - threadIdx.x, blockIdx.x, validity_write_offset, + threadIdx.x, + blockIdx.x, + validity_write_offset, &this_shared_block[validity_write_offset]); *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data & 0xFFFF; @@ -766,7 +772,9 @@ __global__ void copy_validity_from_columns(const size_type num_rows, // write int16 and then int8 if (print_debug) printf("%d %d - writing three bytes to shared offset 0x%x which is %p...\n", - threadIdx.x, blockIdx.x, validity_write_offset, + threadIdx.x, + blockIdx.x, + validity_write_offset, &this_shared_block[validity_write_offset]); *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data & 0xFFFF; @@ -775,7 +783,9 @@ __global__ void copy_validity_from_columns(const size_type num_rows, // write int32 if (print_debug) printf("%d %d - writing 4 bytes to shared offset 0x%x which is %p...\n", - threadIdx.x, blockIdx.x, validity_write_offset, + threadIdx.x, + blockIdx.x, + validity_write_offset, &this_shared_block[validity_write_offset]); *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data; @@ -823,63 +833,20 @@ __global__ void copy_validity_from_columns(const size_type num_rows, output_data[block.buffer_num] + row_offsets[row] + validity_offset + block.start_col / 8; auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); -/* if (num_rows >= 5006) { - auto const row5006_col_65 = output_data[block.buffer_num] + row_offsets[5006] + validity_offset + 65 / 8; - if (output_ptr >= row5006_col_65 && output_ptr <= row5006_col_65 + 4) { - printf("%d %d - writing bytes from %p(0x%x)-%p to %p-%p that overlap global %p(0x%x), which is row 5006, col 65!\n", threadIdx.x, blockIdx.x, &this_shared_block[validity_data_row_length * relative_row], this_shared_block[validity_data_row_length * relative_row], &this_shared_block[validity_data_row_length * relative_row + num_bytes], output_ptr, output_ptr + num_bytes, row5006_col_65, *row5006_col_65); - printf("%d %d - block information\n%d,%d -> %d,%d\n%d columns, %d rows\n", threadIdx.x, blockIdx.x, block.start_col, block.start_row, block.end_col, block.end_row, block.num_cols(), block.num_rows()); - output_check_addr = row5006_col_65; - output_block_start = output_ptr; - output_block_size = num_bytes; - } - }*/ - cuda::memcpy_async( output_ptr, &this_shared_block[validity_data_row_length * relative_row], num_bytes, shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); - - /* auto const padding_ptr = output_ptr + num_bytes; - auto const padding_needed = -reinterpret_cast(padding_ptr) & 7; - if (print_debug) printf( - "absolute_row: %d, row_offset for this row: 0x%x, validity data bytes: %d, end - address: %p, padding bytes %lu\n", row, row_offsets[row], num_bytes, output_ptr + - num_bytes, padding_needed); cuda::memcpy_async(padding_ptr, zero, padding_needed, - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); - */ - - /* if (print_debug) { - for (int i=0; i get_admin_data_sizes(size_t col_size_size, @@ -932,10 +899,16 @@ static __device__ void fetch_blocks_for_row_to_column( ++fetch_index) { auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index]; - if (debug_print) - printf("fetching block %lu of %d for start col %d, end col %d. 
Starting col offset is %p, ending offset %p\n", - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index, - total_blocks, fetch_block.start_col, fetch_block.end_col, &col_offsets[fetch_block.start_col], &col_offsets[fetch_block.end_col]); + if (debug_print) + printf( + "fetching block %lu of %d for start col %d, end col %d. Starting col offset is %p, ending " + "offset %p\n", + blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index, + total_blocks, + fetch_block.start_col, + fetch_block.end_col, + &col_offsets[fetch_block.start_col], + &col_offsets[fetch_block.end_col]); auto const fetch_block_start_row = fetch_block.start_row; auto const fetch_block_end_row = fetch_block.end_row; auto const starting_col_offset = col_offsets[fetch_block.start_col]; @@ -977,7 +950,7 @@ static __device__ void fetch_blocks_for_row_to_column( &shared[fetch_index % max_resident_blocks][shared_row_offset], &col_offsets[fetch_block.start_col], col_offset_bytes); - cuda::memcpy_async(group, + cuda::memcpy_async(group, &shared[fetch_index % max_resident_blocks][shared_row_offset], &col_offsets[fetch_block.start_col], col_offset_bytes, @@ -985,23 +958,6 @@ static __device__ void fetch_blocks_for_row_to_column( shared_row_offset += col_offset_bytes; shared_row_offset = align_offset(shared_row_offset, 8); - if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0 && fetch_block.start_col == 0 && - fetch_block.start_row <= 51 && fetch_block.end_row >= 51) { - printf("Input data for col 0 row 51 is 0x"); - for (int i = 0; i < col_sizes[0]; ++i) { - printf("%x ", input_data[row_offsets[51] + col_offsets[0] + i]); - } - printf("\n"); - printf( - "this is at offset %d-%d and starting column offset is %d and we're reading %d bytes\n", - col_offsets[0], - col_offsets[0] + col_sizes[0], - starting_col_offset, - fetch_block_row_size); - auto shared_offset = (51 - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; - printf("destination is %p", &shared[fetch_index % max_resident_blocks][shared_offset]); - } - for (auto row = fetch_block_start_row + static_cast(threadIdx.x); row <= fetch_block_end_row; row += blockDim.x) { @@ -1012,7 +968,7 @@ static __device__ void fetch_blocks_for_row_to_column( fetch_index % max_resident_blocks, &shared[fetch_index % max_resident_blocks][shared_offset], &input_data[row_offsets[row] + starting_col_offset]); - // copy the main + // copy the main cuda::memcpy_async(&shared[fetch_index % max_resident_blocks][shared_offset], &input_data[row_offsets[row] + starting_col_offset], fetch_block_row_size, @@ -1058,7 +1014,7 @@ __global__ void copy_to_columns(const size_type num_rows, // to speed up some of the random access memory we do, we copy col_sizes and col_offsets // to shared memory for each of the blocks that we work on - /*constexpr*/ bool debug_print = false; //threadIdx.x == 0 && blockIdx.x == 0; + /*constexpr*/ bool debug_print = false; // threadIdx.x == 0 && blockIdx.x == 0; constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; auto group = cooperative_groups::this_thread_block(); extern __shared__ int8_t shared_data[]; @@ -1066,14 +1022,17 @@ __global__ void copy_to_columns(const size_type num_rows, if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("%d block infos are at %p and my index is %d\n", num_block_infos, block_infos, blockIdx.x); + printf( + "%d block infos are at %p and my index is %d\n", num_block_infos, block_infos, blockIdx.x); /* printf("Row Offsets:\n"); for (int i=0; 
i NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED ? NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED @@ -1696,7 +1650,7 @@ std::vector build_validity_block_infos( // note that an element in the table takes just 1 bit, but a row with a single // element still takes 8 bytes! auto const bytes_per_row = align_offset(util::div_rounding_up_unsafe(column_stride, 8), 8); - auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); + auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); std::vector validity_block_infos; for (int col = 0; col < num_columns; col += column_stride) { @@ -1747,7 +1701,6 @@ std::vector build_block_infos(std::vector const& column_s } int const window_height = std::min(desired_window_height, rows_left_in_batch); -// printf("block %d, %d to %d, %d\n", start_col, current_window_start_row, end_col, std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1)); block_infos.emplace_back(detail::block_info{ start_col, current_window_start_row, @@ -1768,10 +1721,13 @@ std::vector build_block_infos(std::vector const& column_s // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in // bytes, not rows or columns. size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); - int const window_height = - std::clamp(util::round_up_safe(std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], total_number_of_rows), 32), - 1, - row_batches[0].row_count); + int const window_height = std::clamp( + util::round_up_safe( + std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], + total_number_of_rows), + 32), + 1, + row_batches[0].row_count); #if defined(DEBUG) printf( "optimal_square_len is %d and we have %d columns, optimal_square_len / column_sizes[0] is %d " @@ -2385,8 +2341,6 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const& in return util::round_down_safe(desired_rows_and_columns, 32); }*/ }(); - printf("column stride is %d and row stride is %d. 
std::min(%d, util::round_down_safe(%d * 8 / %d, 32))\n", column_stride, row_stride, num_rows, shmem_limit_per_block, column_stride); - printf("each block uses %d bytes of shared memory\n", (column_stride / 8) * detail::align_offset(row_stride, 4)); std::vector validity_block_infos; for (int col = 0; col < num_columns; col += column_stride) { for (int row = 0; row < num_rows; row += row_stride) { diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index 26e071eef79..70a4552a6f9 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -168,8 +168,8 @@ TEST_F(ColumnToRowTests, Non2Power) constexpr auto num_rows = 6 * 1024 + 557; for (int i = 0; i < 131; ++i) { - cols.push_back( - cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } @@ -184,9 +184,9 @@ TEST_F(ColumnToRowTests, Non2Power) auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - for (int j=0; jnum_columns(); ++j) { + for (int j = 0; j < old_tbl->num_columns(); ++j) { printf("testing column %d\n", j); - if (j==65) { + if (j == 65) { printf("old\n"); cudf::test::print(old_tbl->get_column(j)); printf("new\n"); @@ -214,8 +214,8 @@ TEST_F(ColumnToRowTests, Big) // 28 columns of 1 million rows constexpr auto num_rows = 1024 * 1024; for (int i = 0; i < 28; ++i) { - cols.push_back( - cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } @@ -241,8 +241,8 @@ TEST_F(ColumnToRowTests, Bigger) // 128 columns of 1 million rows constexpr auto num_rows = 1024 * 1024; for (int i = 0; i < 128; ++i) { - cols.push_back( - cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } @@ -268,8 +268,8 @@ TEST_F(ColumnToRowTests, Biggest) // 128 columns of 2 million rows constexpr auto num_rows = 2 * 1024 * 1024; for (int i = 0; i < 128; ++i) { - cols.push_back( - cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } @@ -396,8 +396,8 @@ TEST_F(RowToColumnTests, Non2Power) constexpr auto num_rows = 6 * 1024 + 557; for (int i = 0; i < 131; ++i) { - cols.push_back( - cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } @@ -424,8 +424,8 @@ TEST_F(RowToColumnTests, Big) // 28 columns of 1 million rows constexpr auto num_rows = 1024 * 1024; for (int i = 0; i < 28; ++i) { - cols.push_back( - 
cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } @@ -452,8 +452,8 @@ TEST_F(RowToColumnTests, Bigger) // 28 columns of 1 million rows constexpr auto num_rows = 1024 * 1024; for (int i = 0; i < 128; ++i) { - cols.push_back( - cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } @@ -480,8 +480,8 @@ TEST_F(RowToColumnTests, Biggest) // 28 columns of 1 million rows constexpr auto num_rows = 5 * 1024 * 1024; for (int i = 0; i < 128; ++i) { - cols.push_back( - cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 9f0df3569a7..c64a61b3373 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -54,9 +54,7 @@ constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; #endif using cudf::detail::make_device_uvector_async; -using cudf::detail::warp_size; - -namespace cudf::java { +namespace cudf { namespace detail { @@ -403,7 +401,6 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ // Fetch ahead up to stages_count subsets for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch]; - auto const num_fetch_cols = fetch_block.num_cols(); auto const num_fetch_rows = fetch_block.num_rows(); auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; @@ -435,7 +432,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto const shared_offset = relative_row * fetch_block_row_size + relative_col_offset; auto const input_src = input_data[absolute_col] + col_size * absolute_row; - // copy the main + // copy the element to global memory cuda::memcpy_async(&shared[fetch % stages_count][shared_offset], input_src, col_size, fetch_barrier); } @@ -445,18 +442,19 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ subset_barrier.arrive_and_wait(); auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; - /* auto const rows_in_block = block.num_rows(); - auto const cols_in_block = block.num_cols();*/ + auto const block_row_size = block.get_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; // copy entire rows to final dest for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; absolute_row += blockDim.x) { + auto const relative_row = absolute_row - block.start_row; auto const output_dest = output_data[block.buffer_num] + absolute_row * block_row_size + column_offset; auto const shared_offset = block_row_size * relative_row; + cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], block_row_size, subset_barrier); } @@ -528,23 +526,22 @@ __global__ 
void copy_validity_from_columns( align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); auto const total_sections = num_sections_x * num_sections_y; - int const warp_id = threadIdx.x / warp_size; - int const lane_id = threadIdx.x % warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / warp_size); + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); // the block is divided into sections. A warp operates on a section at a time. for (int my_section_idx = warp_id; my_section_idx < total_sections; my_section_idx += warps_per_block) { - // convert to rows and cols - auto const section_x = my_section_idx / num_sections_x; - auto const section_y = my_section_idx % num_sections_x; + // convert to rows and cols + auto const section_x = my_section_idx % num_sections_x; + auto const section_y = my_section_idx / num_sections_x; auto const relative_col = section_x * 32 + lane_id; auto const relative_row = section_y * 8; auto const absolute_col = relative_col + block.start_col; auto const absolute_row = relative_row + block.start_row; auto const cols_left = num_columns - absolute_col; - auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); if (absolute_col < num_columns) { @@ -552,14 +549,14 @@ __global__ void copy_validity_from_columns( input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] : 0xFF; // every thread that is participating in the warp has a byte, but it's column-based - // data and we need it in row-based. So we shiffle the bits around with ballot_sync to make + // data and we need it in row-based. So we shuffle the bits around with ballot_sync to make // the bytes we actually write. 
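        // A minimal sketch of that transpose (hypothetical standalone helper, not part of
        // this file): each lane holds validity bits for one *column* (bit i = row i), and
        // one __ballot_sync per row yields a word whose bit k is the validity of column
        // (section start + k) for that row -- exactly the row-major layout written out below.
        //
        //   __device__ inline uint32_t column_bits_to_row_word(uint32_t lane_mask,
        //                                                      uint32_t my_column_bits,
        //                                                      int row_in_section) {
        //     return __ballot_sync(lane_mask, my_column_bits & (1u << row_in_section));
        //   }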
for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); // lead thread in each warp writes data auto const validity_write_offset = validity_data_row_length * (relative_row + i) + relative_col / 8; - if (threadIdx.x % warp_size == 0) { + if (threadIdx.x % detail::warp_size == 0) { if (cols_left <= 8) { // write byte this_shared_block[validity_write_offset] = validity_data & 0xFF; @@ -591,6 +588,7 @@ __global__ void copy_validity_from_columns( auto const output_ptr = output_data[block.buffer_num] + row_offsets[row] + validity_offset + block.start_col / 8; auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); + cuda::memcpy_async( output_ptr, &this_shared_block[validity_data_row_length * relative_row], num_bytes, shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); @@ -647,7 +645,6 @@ fetch_blocks_for_row_to_column(size_t &fetch_index, size_t const processing_inde auto const fetch_block_start_row = fetch_block.start_row; auto const fetch_block_end_row = fetch_block.end_row; auto const starting_col_offset = col_offsets[fetch_block.start_col]; - auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); auto const num_fetch_cols = fetch_block.num_cols(); auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( @@ -718,9 +715,9 @@ __global__ void copy_to_columns(const size_type num_rows, const size_type num_co extern __shared__ int8_t shared_data[]; int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; - __shared__ cuda::barrier block_barrier[stages_count]; + __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; if (group.thread_rank() == 0) { - for (int i = 0; i < stages_count; ++i) { + for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { init(&block_barrier[i], group.size()); } } @@ -748,12 +745,11 @@ __global__ void copy_to_columns(const size_type num_rows, const size_type num_co block_infos, _col_sizes, _col_offsets, row_offsets, input_data, shared, group, block_barrier); - auto &subset_barrier = block_barrier[subset % stages_count]; + auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; // ensure our data is ready subset_barrier.arrive_and_wait(); - auto const block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; - + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; auto const rows_in_block = block.num_rows(); auto const cols_in_block = block.num_cols(); @@ -851,18 +847,15 @@ __global__ void copy_validity_to_columns( auto const block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; auto const block_start_col = block.start_col; auto const block_start_row = block.start_row; - auto const num_block_cols = block.num_cols(); auto const num_block_rows = block.num_rows(); - auto const num_sections_x = (num_block_cols + 7) / 8; auto const num_sections_y = (num_block_rows + 31) / 32; auto const validity_data_col_length = num_sections_y * 4; // words to bytes auto const total_sections = num_sections_x * num_sections_y; - - int const warp_id = threadIdx.x / warp_size; - int const lane_id = threadIdx.x % warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / warp_size); + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / 
detail::warp_size); // the block is divided into sections. A warp operates on a section at a time. for (int my_section_idx = warp_id; my_section_idx < total_sections; @@ -870,7 +863,6 @@ __global__ void copy_validity_to_columns( // convert to rows and cols auto const section_x = my_section_idx % num_sections_x; auto const section_y = my_section_idx / num_sections_x; - auto const relative_col = section_x * 8; auto const relative_row = section_y * 32 + lane_id; auto const absolute_col = relative_col + block_start_col; @@ -890,9 +882,11 @@ __global__ void copy_validity_to_columns( ++i, byte_mask <<= 1) { auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); // lead thread in each warp writes data - if (threadIdx.x % warp_size == 0) { + if (threadIdx.x % detail::warp_size == 0) { auto const validity_write_offset = validity_data_col_length * (relative_col + i) + relative_row / 8; + auto const write_5006_offset = 837; // validity_data_col_length * (65 - block_start_col) + // + (5006 - block_start_row)/8; if (rows_left <= 8) { // write byte @@ -922,6 +916,8 @@ __global__ void copy_validity_to_columns( // now async memcpy the shared for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { auto const relative_col = col - block.start_col; + auto const words_to_copy = util::div_rounding_up_unsafe(num_block_rows, 32); + auto const starting_address = output_nm[col] + word_index(block_start_row); cuda::memcpy_async( output_nm[col] + word_index(block_start_row), @@ -965,8 +961,9 @@ static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, // level when writing validity data out to main memory, and that would // need to change if we split a word of validity data between blocks. int y_block_size = (num_columns + 3) / 4; // cudf::util::div_rounding_up_safe(num_columns, 4); - if (y_block_size > 32) + if (y_block_size > 32) { y_block_size = 32; + } int x_possible_block_size = 1024 / y_block_size; // 48KB is the default setting for shared memory per block according to the cuda tutorials // If someone configures the GPU to only have 16 KB this might not work. @@ -1135,7 +1132,10 @@ build_validity_block_infos(size_type const &num_columns, size_type const &num_ro }(), 8); // we fit as much as we can given the column stride - auto const row_stride = std::min(num_rows, shmem_limit_per_block * 8 / column_stride); + // note that an element in the table takes just 1 bit, but a row with a single + // element still takes 8 bytes! + auto const bytes_per_row = align_offset(util::div_rounding_up_unsafe(column_stride, 8), 8); + auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); std::vector validity_block_infos; for (int col = 0; col < num_columns; col += column_stride) { @@ -1203,13 +1203,12 @@ std::vector build_block_infos(std::vector const &column_s // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in // bytes, not rows or columns. size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); - int const window_height = - std::clamp(util::round_up_safe( - optimal_square_len <= (size_type)column_sizes.size() ? 
- std::min(optimal_square_len / column_sizes[0], total_number_of_rows) : - row_batches[0].row_count / 2, - 32), - 1, row_batches[0].row_count); + int const window_height = std::clamp( + util::round_up_safe( + std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], + total_number_of_rows), + 32), + 1, row_batches[0].row_count); auto calc_admin_data_size = [](int num_cols) -> size_type { // admin data is the column sizes and column start information. @@ -1233,8 +1232,9 @@ std::vector build_block_infos(std::vector const &column_s if (row_size_with_end_pad * window_height + calc_admin_data_size(col - current_window_start_col) > shmem_limit_per_block) { + // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col - 1, window_height); + build_blocks(current_window_start_col, col == 0 ? col : col - 1, window_height); row_size = detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); row_size += col_size; // alignment required for shared memory window boundary to match @@ -1274,9 +1274,8 @@ std::vector> convert_to_rows(cudf::table_view cons int total_shmem; CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - // TODO: kernels fail to launch if we use all the available shared memory. + // TODO: why? total_shmem -= 1024; - int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; // break up the work into blocks, which are a starting and ending row/col #. @@ -1381,6 +1380,16 @@ std::vector> convert_to_rows(cudf::table_view cons } c = c.child(1); } + exclusive_scan([t](int row_index) { + size_type total_row_size = 0; + for (int i=0 i> convert_to_rows(cudf::table_view cons // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. auto validity_size = num_bitmask_words(num_columns) * 4; + // thrust for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned @@ -1578,7 +1588,7 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in int total_shmem; CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - // TODO: unable to launch a kernel with all shared used + // TODO why? 
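   // (Hedged editor's note, not from the original author: on some architectures the
   // runtime reserves about 1 KB of shared memory per block for system use, and any
   // dynamic allocation above the 48 KB default needs an explicit opt-in, e.g.
   //
   //   CUDA_TRY(cudaDeviceGetAttribute(&total_shmem,
   //                                   cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id));
   //   CUDA_TRY(cudaFuncSetAttribute(detail::copy_to_columns,
   //                                 cudaFuncAttributeMaxDynamicSharedMemorySize,
   //                                 total_shmem));
   //
   // either of which could explain why launching with the full reported limit fails.)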
total_shmem -= 1024; int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; @@ -1628,11 +1638,7 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); -#if defined(DEBUG) - dim3 threads(std::min(std::min(128, shmem_limit_per_block / 8), (int)child.size())); -#else - dim3 threads(std::min(256, (int)child.size())); -#endif + dim3 threads(std::min(std::min(256, shmem_limit_per_block / 8), (int)child.size())); detail::copy_to_columns<<>>( num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), @@ -1641,8 +1647,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); auto const column_stride = [&]() { if (desired_rows_and_columns > num_columns) { - // not many columns, group it into 8s and ship it off - return std::min(8, num_columns); + // not many columns, group it into 64s and ship it off + return std::min(64, num_columns); } else { return util::round_down_safe(desired_rows_and_columns, 8); } From d563eaa8443f4e4e8834ac80b2010360a3040425 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Fri, 1 Oct 2021 15:14:54 +0000 Subject: [PATCH 52/80] Fixing merge issue --- cpp/benchmarks/CMakeLists.txt | 54 ++++++++++++++++------------------- 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index a8f075d2464..79783f0e512 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -22,21 +22,10 @@ target_compile_options( "$<$:${CUDF_CUDA_FLAGS}>" ) -<<<<<<< HEAD target_link_libraries( cudf_datagen PUBLIC GTest::gmock GTest::gtest GTest::gmock_main GTest::gtest_main benchmark::benchmark nvbench::nvbench Threads::Threads cudf ) -======= -target_link_libraries(cudf_datagen - PUBLIC GTest::gmock - GTest::gtest - GTest::gmock_main - GTest::gtest_main - benchmark::benchmark - Threads::Threads - cudf) ->>>>>>> working on row and column conversions target_include_directories( cudf_datagen @@ -57,7 +46,6 @@ target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen) # This function takes in a benchmark name and benchmark source and handles setting all of the # associated properties and linking to build the benchmark function(ConfigureBench CMAKE_BENCH_NAME) -<<<<<<< HEAD add_executable(${CMAKE_BENCH_NAME} ${ARGN}) set_target_properties( ${CMAKE_BENCH_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY @@ -83,17 +71,6 @@ endfunction() # ################################################################################################## # * column benchmarks ----------------------------------------------------------------------------- -======= - add_executable(${CMAKE_BENCH_NAME} ${ARGN}) - set_target_properties(${CMAKE_BENCH_NAME} - PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$") - target_link_libraries(${CMAKE_BENCH_NAME} - PRIVATE cudf_benchmark_common cudf_datagen benchmark::benchmark_main) -endfunction() - -################################################################################################### -# - column benchmarks ----------------------------------------------------------------------------- ->>>>>>> working on row and column conversions ConfigureBench(COLUMN_CONCAT_BENCH column/concatenate_benchmark.cpp) # 
################################################################################################## @@ -104,17 +81,12 @@ ConfigureBench(GATHER_BENCH copying/gather_benchmark.cu) # * scatter benchmark ----------------------------------------------------------------------------- ConfigureBench(SCATTER_BENCH copying/scatter_benchmark.cu) -<<<<<<< HEAD # ################################################################################################## # * lists scatter benchmark ----------------------------------------------------------------------- ConfigureBench(SCATTER_LISTS_BENCH lists/copying/scatter_lists_benchmark.cu) # ################################################################################################## # * contiguous_split benchmark ------------------------------------------------------------------- -======= -################################################################################################### -# - contiguous_split benchmark ------------------------------------------------------------------- ->>>>>>> working on row and column conversions ConfigureBench(CONTIGUOUS_SPLIT_BENCH copying/contiguous_split_benchmark.cu) # ################################################################################################## @@ -146,8 +118,13 @@ ConfigureNVBench(JOIN_NVBENCH join/join_nvbench.cu) ======= ################################################################################################### # - join benchmark -------------------------------------------------------------------------------- +<<<<<<< HEAD ConfigureBench(JOIN_BENCH join/join_benchmark.cu) >>>>>>> working on row and column conversions +======= +ConfigureBench(JOIN_BENCH join/join_benchmark.cu join/conditional_join_benchmark.cu) +ConfigureNVBench(JOIN_NVBENCH join/join_nvbench.cu) +>>>>>>> Fixing merge issue # ################################################################################################## # * iterator benchmark ---------------------------------------------------------------------------- @@ -238,6 +215,7 @@ ConfigureBench(CSV_WRITER_BENCH io/csv/csv_writer_benchmark.cpp) # * ast benchmark --------------------------------------------------------------------------------- ConfigureBench(AST_BENCH ast/transform_benchmark.cpp) +<<<<<<< HEAD # ################################################################################################## # * binaryop benchmark ---------------------------------------------------------------------------- ConfigureBench( @@ -249,6 +227,18 @@ ConfigureBench( # * nvtext benchmark ------------------------------------------------------------------- ConfigureBench( TEXT_BENCH +======= +################################################################################################### +# - binaryop benchmark ---------------------------------------------------------------------------- +ConfigureBench(BINARYOP_BENCH + binaryop/binaryop_benchmark.cpp + binaryop/compiled_binaryop_benchmark.cpp + binaryop/jit_binaryop_benchmark.cpp) + +################################################################################################### +# - nvtext benchmark ------------------------------------------------------------------- +ConfigureBench(TEXT_BENCH +>>>>>>> Fixing merge issue text/ngrams_benchmark.cpp text/normalize_benchmark.cpp text/normalize_spaces_benchmark.cpp @@ -273,6 +263,7 @@ ConfigureBench( string/factory_benchmark.cu string/filter_benchmark.cpp string/find_benchmark.cpp + string/repeat_strings_benchmark.cpp string/replace_benchmark.cpp 
string/replace_re_benchmark.cpp string/split_benchmark.cpp @@ -291,6 +282,11 @@ ConfigureBench(JSON_BENCH string/json_benchmark.cpp) ConfigureBench(MULTIBYTE_SPLIT_BENCHMARK io/text/multibyte_split_benchmark.cpp) ======= ################################################################################################### -# - row conversion benchmark ---------------------------------------------------------------------------- +# - io benchmark --------------------------------------------------------------------- +ConfigureBench(MULTIBYTE_SPLIT_BENCHMARK + io/text/multibyte_split_benchmark.cpp) + +################################################################################################### +# - row conversion benchmark --------------------------------------------------------- ConfigureBench(ROW_CONVERSION_BENCH row_conversion/row_conversion.cpp) >>>>>>> working on row and column conversions From 5b6688db4a790947f9c7ffb5c9e7cb5f73c4124d Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Fri, 1 Oct 2021 15:17:11 +0000 Subject: [PATCH 53/80] working on code to move block creation and batch creation to gpu --- cpp/src/row_conversion/row_conversion.cu | 180 +++++++++++++++++++- cpp/tests/row_conversion/row_conversion.cpp | 7 - 2 files changed, 178 insertions(+), 9 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index eb3c4b28b6a..ae218e637d0 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -20,6 +20,8 @@ #include #include #include +#include "cudf/detail/iterator.cuh" +#include "cudf/lists/lists_column_device_view.cuh" #include @@ -43,7 +45,9 @@ #include #include #include +#include +#include #include #include @@ -56,6 +60,7 @@ constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; #endif using cudf::detail::make_device_uvector_async; +using rmm::device_uvector; namespace cudf { namespace detail { @@ -1352,8 +1357,6 @@ __global__ void copy_validity_to_columns(const size_type num_rows, if (threadIdx.x % detail::warp_size == 0) { auto const validity_write_offset = validity_data_col_length * (relative_col + i) + relative_row / 8; - auto const write_5006_offset = 837; // validity_data_col_length * (65 - - // block_start_col) + (5006 - block_start_row)/8; if (print_debug) printf( @@ -1674,6 +1677,173 @@ std::vector build_validity_block_infos( return validity_block_infos; } +constexpr size_t max_batch_size = 1024; // 2ul * 1024 * 1024 * 1024; + +template +void build_batches(size_t total_size, + size_type num_rows, + CumulativeRowSize cumulative_row_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const num_batches = ((total_size + (max_batch_size - 1)) / max_batch_size); + auto const num_offsets = num_batches + 1; + printf("%lu batches so %lu offsets\n", num_batches, num_offsets); + + // at most max gpu memory / 2GB iterations. 
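  // The loop below is easier to follow as a host-side sketch (illustrative only; cum[i]
  // stands for cumulative_row_size[i], the cumulative byte size of the rows up to i):
  //
  //   size_type last = 0;
  //   offsets.push_back(0);
  //   while (another batch is needed) {
  //     // row sizes measured from the end of the previous batch
  //     auto batch_bytes = [&](size_type i) { return cum[i] - cum[last]; };
  //     size_type end = last;
  //     while (end + 1 < num_rows && batch_bytes(end + 1) < max_batch_size) { ++end; }
  //     offsets.push_back(end);   // last row that still fits in this batch
  //     last = end;
  //   }
  //
  // thrust::lower_bound performs the same search on the device in O(log n) per batch.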
+ std::vector h_batch_row_offsets; + h_batch_row_offsets.reserve(num_offsets); + h_batch_row_offsets.push_back(0); + size_type last_row_end = 0; + while (h_batch_row_offsets.size() < num_batches) { + // subtract out the size of the last row in the previous batch + auto adjusted_row_size = + thrust::make_transform_iterator(cumulative_row_size + last_row_end, + [last_row_end, cumulative_row_size] __device__(size_t size) { + return size - cumulative_row_size[last_row_end]; + }); + // find the next max_batch_size boundary + size_type const row_end = ((thrust::lower_bound(rmm::exec_policy(stream), + adjusted_row_size, + adjusted_row_size + (num_rows - last_row_end), + max_batch_size) - + adjusted_row_size) + + last_row_end) - + 1; + + h_batch_row_offsets.push_back(row_end); + last_row_end = row_end; + } + printf("batches: "); + for (uint i = 0; i < h_batch_row_offsets.size(); ++i) { + printf("%d ", h_batch_row_offsets[i]); + } + printf("\n"); +} + +int compute_block_counts(device_uvector const& batch_row_offsets, + int desired_window_height, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + size_type const num_batches = batch_row_offsets.size() - 1; + device_uvector num_blocks(num_batches, stream); + auto iter = thrust::make_counting_iterator(0); + thrust::transform( + rmm::exec_policy(stream), + iter, + iter + num_batches, + num_blocks.begin(), + [desired_window_height, + batch_row_offsets = batch_row_offsets.data()] __device__(auto batch_index) -> size_type { + return (batch_row_offsets[batch_index + 1] - batch_row_offsets[batch_index]) / + desired_window_height; + }); + return thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); +} + +size_type block_lambda( + block_info* blocks, + device_uvector const& batch_row_offsets, // comes from build_batches + int column_start, + int column_end, + int desired_window_height, + int total_number_of_rows, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + size_type const num_batches = batch_row_offsets.size() - 1; + device_uvector num_blocks(num_batches, stream); + auto iter = thrust::make_counting_iterator(0); + thrust::transform( + rmm::exec_policy(stream), + iter, + iter + num_batches, + num_blocks.begin(), + [=, batch_row_offsets = batch_row_offsets.data()] __device__(int batch_index) -> size_type { + return (batch_row_offsets[batch_index + 1] - batch_row_offsets[batch_index]) / + desired_window_height; + }); + size_type const total_blocks = + thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); + device_uvector block_starts(num_batches, stream); + thrust::exclusive_scan(rmm::exec_policy(stream), + num_blocks.begin(), + num_blocks.end(), + block_starts.begin()); // in blocks + + thrust::for_each( + rmm::exec_policy(stream), + iter, + iter + total_blocks, + [ =, + block_starts = block_starts.data(), + batch_row_offsets = batch_row_offsets.data()] __device__(size_type block_index) { + block_info& bi = blocks[block_index]; + + // what batch this block falls in + auto const batch_index_iter = + thrust::lower_bound(thrust::seq, block_starts, block_starts + num_batches, block_index); + auto const batch_index = batch_index_iter == block_starts ? 0 : *batch_index_iter; + // local index within the block + int const local_block_index = block_index - block_starts[batch_index]; + // the start row for this batch. 
+ int const batch_row_start = batch_row_offsets[batch_index]; + // the start row for this block + int const block_row_start = batch_row_start + (local_block_index * desired_window_height); + // the end row for this block + int const max_row = std::min(total_number_of_rows, + batch_index + 1 > num_batches + ? std::numeric_limits::max() + : static_cast(batch_row_offsets[batch_index + 1])); + int const block_row_end = + std::min(batch_row_start + ((local_block_index + 1) * desired_window_height) - 1, + total_number_of_rows); + + // stuff the block + bi.start_col = column_start; + bi.end_col = column_end; + bi.start_row = block_row_start; + bi.end_row = block_row_end; + bi.buffer_num = batch_index; + }); + + return total_blocks; +} + +void test_block_lambda(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) +{ + device_uvector batch_row_offsets(3, stream); + batch_row_offsets.set_element(0, 0, stream); + batch_row_offsets.set_element(1, 2000, stream); + batch_row_offsets.set_element(2, 5000, stream); + + // three groups of columns that can hold 128, 1024, and 768 rows each. + auto const total_blocks = compute_block_counts(batch_row_offsets, 128, stream, mr) + + compute_block_counts(batch_row_offsets, 1024, stream, mr) + + compute_block_counts(batch_row_offsets, 768, stream, mr); + + auto const table_num_rows = 50 * 1024; + + // allocate memory for all blocks + device_uvector blocks(total_blocks, stream); + + auto used_blocks = + block_lambda(blocks.data(), batch_row_offsets, 0, 15, 128, table_num_rows, stream, mr); + used_blocks += block_lambda( + blocks.data() + used_blocks, batch_row_offsets, 16, 28, 1024, table_num_rows, stream, mr); + used_blocks += block_lambda( + blocks.data() + used_blocks, batch_row_offsets, 29, 32, 768, table_num_rows, stream, mr); + + CUDF_EXPECTS(used_blocks == total_blocks, "used not equal to total!"); + + for (int i = 0; i < total_blocks; ++i) { + auto const block = blocks.element(i, stream); + printf( + "%d: %d,%d -> %d,%d\n", i, block.start_col, block.start_row, block.end_col, block.end_row); + } +} + std::vector build_block_infos(std::vector const& column_sizes, std::vector const& column_starts, std::vector const& row_batches, @@ -2245,6 +2415,12 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const& in cudf::size_type num_columns = schema.size(); cudf::size_type num_rows = input.parent().size(); + auto cumulative_row_size = cudf::detail::make_counting_transform_iterator( + 0, [] __device__(size_t row_index) { return 300 * row_index; }); + detail::build_batches(1024 * 1024, 1024, cumulative_row_size, stream, mr); + + detail::test_block_lambda(stream, mr); + int device_id; CUDA_TRY(cudaGetDevice(&device_id)); int total_shmem; diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index 70a4552a6f9..48d9690d583 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -185,13 +185,6 @@ TEST_F(ColumnToRowTests, Non2Power) auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); for (int j = 0; j < old_tbl->num_columns(); ++j) { - printf("testing column %d\n", j); - if (j == 65) { - printf("old\n"); - cudf::test::print(old_tbl->get_column(j)); - printf("new\n"); - cudf::test::print(new_tbl->get_column(j)); - } CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); } From 53912ca1b9a786d9ae4c3cb7241d2ff87bd1781c Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 6 Oct 2021 
19:41:49 +0000 Subject: [PATCH 54/80] pulling incomplete code for gpu building block data --- cpp/src/row_conversion/row_conversion.cu | 173 --------------------- java/src/main/native/src/row_conversion.cu | 53 +------ 2 files changed, 6 insertions(+), 220 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index ae218e637d0..9674000a69d 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -1677,173 +1677,6 @@ std::vector build_validity_block_infos( return validity_block_infos; } -constexpr size_t max_batch_size = 1024; // 2ul * 1024 * 1024 * 1024; - -template -void build_batches(size_t total_size, - size_type num_rows, - CumulativeRowSize cumulative_row_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const num_batches = ((total_size + (max_batch_size - 1)) / max_batch_size); - auto const num_offsets = num_batches + 1; - printf("%lu batches so %lu offsets\n", num_batches, num_offsets); - - // at most max gpu memory / 2GB iterations. - std::vector h_batch_row_offsets; - h_batch_row_offsets.reserve(num_offsets); - h_batch_row_offsets.push_back(0); - size_type last_row_end = 0; - while (h_batch_row_offsets.size() < num_batches) { - // subtract out the size of the last row in the previous batch - auto adjusted_row_size = - thrust::make_transform_iterator(cumulative_row_size + last_row_end, - [last_row_end, cumulative_row_size] __device__(size_t size) { - return size - cumulative_row_size[last_row_end]; - }); - // find the next max_batch_size boundary - size_type const row_end = ((thrust::lower_bound(rmm::exec_policy(stream), - adjusted_row_size, - adjusted_row_size + (num_rows - last_row_end), - max_batch_size) - - adjusted_row_size) + - last_row_end) - - 1; - - h_batch_row_offsets.push_back(row_end); - last_row_end = row_end; - } - printf("batches: "); - for (uint i = 0; i < h_batch_row_offsets.size(); ++i) { - printf("%d ", h_batch_row_offsets[i]); - } - printf("\n"); -} - -int compute_block_counts(device_uvector const& batch_row_offsets, - int desired_window_height, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - size_type const num_batches = batch_row_offsets.size() - 1; - device_uvector num_blocks(num_batches, stream); - auto iter = thrust::make_counting_iterator(0); - thrust::transform( - rmm::exec_policy(stream), - iter, - iter + num_batches, - num_blocks.begin(), - [desired_window_height, - batch_row_offsets = batch_row_offsets.data()] __device__(auto batch_index) -> size_type { - return (batch_row_offsets[batch_index + 1] - batch_row_offsets[batch_index]) / - desired_window_height; - }); - return thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); -} - -size_type block_lambda( - block_info* blocks, - device_uvector const& batch_row_offsets, // comes from build_batches - int column_start, - int column_end, - int desired_window_height, - int total_number_of_rows, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - size_type const num_batches = batch_row_offsets.size() - 1; - device_uvector num_blocks(num_batches, stream); - auto iter = thrust::make_counting_iterator(0); - thrust::transform( - rmm::exec_policy(stream), - iter, - iter + num_batches, - num_blocks.begin(), - [=, batch_row_offsets = batch_row_offsets.data()] __device__(int batch_index) -> size_type { - return (batch_row_offsets[batch_index + 1] - batch_row_offsets[batch_index]) / - 
desired_window_height; - }); - size_type const total_blocks = - thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); - device_uvector block_starts(num_batches, stream); - thrust::exclusive_scan(rmm::exec_policy(stream), - num_blocks.begin(), - num_blocks.end(), - block_starts.begin()); // in blocks - - thrust::for_each( - rmm::exec_policy(stream), - iter, - iter + total_blocks, - [ =, - block_starts = block_starts.data(), - batch_row_offsets = batch_row_offsets.data()] __device__(size_type block_index) { - block_info& bi = blocks[block_index]; - - // what batch this block falls in - auto const batch_index_iter = - thrust::lower_bound(thrust::seq, block_starts, block_starts + num_batches, block_index); - auto const batch_index = batch_index_iter == block_starts ? 0 : *batch_index_iter; - // local index within the block - int const local_block_index = block_index - block_starts[batch_index]; - // the start row for this batch. - int const batch_row_start = batch_row_offsets[batch_index]; - // the start row for this block - int const block_row_start = batch_row_start + (local_block_index * desired_window_height); - // the end row for this block - int const max_row = std::min(total_number_of_rows, - batch_index + 1 > num_batches - ? std::numeric_limits::max() - : static_cast(batch_row_offsets[batch_index + 1])); - int const block_row_end = - std::min(batch_row_start + ((local_block_index + 1) * desired_window_height) - 1, - total_number_of_rows); - - // stuff the block - bi.start_col = column_start; - bi.end_col = column_end; - bi.start_row = block_row_start; - bi.end_row = block_row_end; - bi.buffer_num = batch_index; - }); - - return total_blocks; -} - -void test_block_lambda(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) -{ - device_uvector batch_row_offsets(3, stream); - batch_row_offsets.set_element(0, 0, stream); - batch_row_offsets.set_element(1, 2000, stream); - batch_row_offsets.set_element(2, 5000, stream); - - // three groups of columns that can hold 128, 1024, and 768 rows each. 
- auto const total_blocks = compute_block_counts(batch_row_offsets, 128, stream, mr) + - compute_block_counts(batch_row_offsets, 1024, stream, mr) + - compute_block_counts(batch_row_offsets, 768, stream, mr); - - auto const table_num_rows = 50 * 1024; - - // allocate memory for all blocks - device_uvector blocks(total_blocks, stream); - - auto used_blocks = - block_lambda(blocks.data(), batch_row_offsets, 0, 15, 128, table_num_rows, stream, mr); - used_blocks += block_lambda( - blocks.data() + used_blocks, batch_row_offsets, 16, 28, 1024, table_num_rows, stream, mr); - used_blocks += block_lambda( - blocks.data() + used_blocks, batch_row_offsets, 29, 32, 768, table_num_rows, stream, mr); - - CUDF_EXPECTS(used_blocks == total_blocks, "used not equal to total!"); - - for (int i = 0; i < total_blocks; ++i) { - auto const block = blocks.element(i, stream); - printf( - "%d: %d,%d -> %d,%d\n", i, block.start_col, block.start_row, block.end_col, block.end_row); - } -} - std::vector build_block_infos(std::vector const& column_sizes, std::vector const& column_starts, std::vector const& row_batches, @@ -2415,12 +2248,6 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const& in cudf::size_type num_columns = schema.size(); cudf::size_type num_rows = input.parent().size(); - auto cumulative_row_size = cudf::detail::make_counting_transform_iterator( - 0, [] __device__(size_t row_index) { return 300 * row_index; }); - detail::build_batches(1024 * 1024, 1024, cumulative_row_size, stream, mr); - - detail::test_block_lambda(stream, mr); - int device_id; CUDA_TRY(cudaGetDevice(&device_id)); int total_shmem; diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index c64a61b3373..481787c6004 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -21,6 +21,8 @@ #include #include +#include +#include #include #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 @@ -42,6 +44,8 @@ #include #include #include +#include +#include #include #include @@ -54,6 +58,7 @@ constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; #endif using cudf::detail::make_device_uvector_async; +using rmm::device_uvector; namespace cudf { namespace detail { @@ -885,8 +890,6 @@ __global__ void copy_validity_to_columns( if (threadIdx.x % detail::warp_size == 0) { auto const validity_write_offset = validity_data_col_length * (relative_col + i) + relative_row / 8; - auto const write_5006_offset = 837; // validity_data_col_length * (65 - block_start_col) - // + (5006 - block_start_row)/8; if (rows_left <= 8) { // write byte @@ -1330,28 +1333,7 @@ std::vector> convert_to_rows(cudf::table_view cons }); size_type fixed_width_size_per_row = - detail::compute_column_information(iter, iter + num_columns, column_starts, - column_sizes); //, - // [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); - /* size_type fixed_width_size_per_row = 0; - for (int col = 0; col < num_columns; ++col) { - auto cv = tbl.column(col); - auto col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (nested_type) { variable_width_columns.push_back(cv); } - - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 
8 : size_of(col_type); - - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - }*/ + detail::compute_column_information(iter, iter + num_columns, column_starts, column_sizes); auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); @@ -1368,29 +1350,6 @@ std::vector> convert_to_rows(cudf::table_view cons // will be included in the variable-width data blob at the end of the // row. return 0; - /* auto c = variable_width_columns[col]; - while (true) { - auto col_offsets = c.child(0).data(); - auto col_data_size = size_of(c.child(1).type()); - std::size_t alignment_needed = col_data_size; - - row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; - if (c.num_children() == 0) { - break; - } - c = c.child(1); - } - exclusive_scan([t](int row_index) { - size_type total_row_size = 0; - for (int i=0 i Date: Thu, 7 Oct 2021 04:01:36 +0000 Subject: [PATCH 55/80] Fixing issue Raza found with 8-byte data --- cpp/src/row_conversion/row_conversion.cu | 27 +++-- cpp/tests/row_conversion/row_conversion.cpp | 122 ++++++++++++++++---- java/src/main/native/src/row_conversion.cu | 23 ++-- 3 files changed, 132 insertions(+), 40 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 9674000a69d..84fab20fce5 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -333,9 +333,9 @@ struct block_info { int end_row; int buffer_num; - __host__ __device__ size_type get_row_size(size_type const* const col_offsets, - size_type const* const col_sizes, - bool debug_print = false) const + __host__ __device__ size_type get_shared_row_size(size_type const* const col_offsets, + size_type const* const col_sizes, + bool debug_print = false) const { if (debug_print) printf("col_offsets[%d]: %p + col_sizes[%d]: %p - col_offsets[%d]: %p\n%d + %d - %d\n", @@ -350,6 +350,14 @@ struct block_info { col_offsets[start_col]); return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); } + __host__ __device__ size_type get_dest_row_size(size_type const* const col_offsets, + size_type const* const col_sizes, + bool debug_print = false) const + { + return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col] + + util::div_rounding_up_unsafe(num_cols(), 8), + 8); + } __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } @@ -456,7 +464,7 @@ __global__ void copy_from_columns(const size_type num_rows, auto const num_fetch_cols = fetch_block.num_cols(); auto const num_fetch_rows = fetch_block.num_rows(); auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; - auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); + auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); auto const starting_column_offset = col_offsets[fetch_block.start_col]; auto& fetch_barrier = block_barrier[fetch % NUM_BLOCKS_PER_KERNEL_LOADED]; @@ -513,7 +521,8 @@ __global__ void copy_from_columns(const size_type num_rows, 
/* auto const rows_in_block = block.num_rows(); auto const cols_in_block = block.num_cols();*/ - auto const block_row_size = block.get_row_size(col_offsets, col_sizes); + auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); + auto const dest_row_size = block.get_dest_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; // copy entire rows to final dest @@ -521,7 +530,7 @@ __global__ void copy_from_columns(const size_type num_rows, absolute_row += blockDim.x) { auto const relative_row = absolute_row - block.start_row; auto const output_dest = - output_data[block.buffer_num] + absolute_row * block_row_size + column_offset; + output_data[block.buffer_num] + absolute_row * dest_row_size + column_offset; if (debug_print) printf("processing row %d\noutput data[%d] is address %p\n", absolute_row, @@ -918,8 +927,8 @@ static __device__ void fetch_blocks_for_row_to_column( auto const fetch_block_end_row = fetch_block.end_row; auto const starting_col_offset = col_offsets[fetch_block.start_col]; - auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); - auto const num_fetch_cols = fetch_block.num_cols(); + auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); + auto const num_fetch_cols = fetch_block.num_cols(); auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( sizeof(decltype(*col_sizes)), sizeof(decltype(*col_offsets)), num_fetch_cols); auto& fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; @@ -1115,7 +1124,7 @@ __global__ void copy_to_columns(const size_type num_rows, auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); - auto block_row_size = block.get_row_size(_col_offsets, _col_sizes, debug_print); + auto block_row_size = block.get_shared_row_size(_col_offsets, _col_sizes, debug_print); // now we copy from shared memory to final destination. 
// the data is laid out in rows in shared memory, so the reads diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index 48d9690d583..0ab8b70a0f7 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -46,9 +46,9 @@ TEST_F(ColumnToRowTests, Single) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - } + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, Simple) @@ -68,9 +68,9 @@ TEST_F(ColumnToRowTests, Simple) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - } + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, Tall) @@ -93,9 +93,9 @@ TEST_F(ColumnToRowTests, Tall) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - } + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, Wide) @@ -122,9 +122,9 @@ TEST_F(ColumnToRowTests, Wide) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - } + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, SingleByteWide) @@ -153,9 +153,9 @@ TEST_F(ColumnToRowTests, SingleByteWide) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - } + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, Non2Power) @@ -191,9 +191,9 @@ TEST_F(ColumnToRowTests, Non2Power) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - } + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, Big) @@ -218,9 +218,21 @@ TEST_F(ColumnToRowTests, Big) auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + for (int j = 0; j < old_tbl->num_columns(); ++j) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); + } + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } + + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, Bigger) @@ -245,9 +257,20 @@ TEST_F(ColumnToRowTests, Bigger) auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); 
- for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + for (int j = 0; j < old_tbl->num_columns(); ++j) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); + } + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } + + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, Biggest) @@ -272,9 +295,20 @@ TEST_F(ColumnToRowTests, Biggest) auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + for (int j = 0; j < old_tbl->num_columns(); ++j) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); + } + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(RowToColumnTests, Single) @@ -379,6 +413,46 @@ TEST_F(RowToColumnTests, SingleByteWide) } } +TEST_F(RowToColumnTests, Raza) +{ + std::vector> cols; + std::vector views; + std::vector schema{cudf::data_type{cudf::type_id::INT64}, + cudf::data_type{cudf::type_id::FLOAT64}, + cudf::data_type{cudf::type_id::INT8}, + cudf::data_type{cudf::type_id::BOOL8}, + cudf::data_type{cudf::type_id::FLOAT32}, + cudf::data_type{cudf::type_id::INT8}, + cudf::data_type{cudf::type_id::INT32}, + cudf::data_type{cudf::type_id::INT64}}; + + cudf::test::fixed_width_column_wrapper c0({3, 9, 4, 2, 20, 0}, {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_width_column_wrapper c1({5.0, 9.5, 0.9, 7.23, 2.8, 0.0}, + {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_width_column_wrapper c2({5, 1, 0, 2, 7, 0}, {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_width_column_wrapper c3({true, false, false, true, false, false}, + {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_width_column_wrapper c4({1.0f, 3.5f, 5.9f, 7.1f, 9.8f, 0.0f}, + {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_width_column_wrapper c5({2, 3, 4, 5, 9, 0}, {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_point_column_wrapper c6( + {-300, 500, 950, 90, 723, 0}, {1, 1, 1, 1, 1, 1, 1, 0}, numeric::scale_type{-2}); + cudf::test::fixed_point_column_wrapper c7( + {-80, 30, 90, 20, 200, 0}, {1, 1, 1, 1, 1, 1, 0}, numeric::scale_type{-1}); + + cudf::table_view in({c0, c1, c2, c3, c4, c5, c6, c7}); + + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + TEST_F(RowToColumnTests, Non2Power) { auto r = diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 481787c6004..1808c7534df 100644 --- a/java/src/main/native/src/row_conversion.cu 
+++ b/java/src/main/native/src/row_conversion.cu @@ -330,10 +330,18 @@ struct block_info { int end_row; int buffer_num; - __host__ __device__ size_type get_row_size(size_type const *const col_offsets, - size_type const *const col_sizes) const { + __host__ __device__ size_type get_shared_row_size(size_type const *const col_offsets, + size_type const *const col_sizes, + bool debug_print = false) const { return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); } + __host__ __device__ size_type get_dest_row_size(size_type const *const col_offsets, + size_type const *const col_sizes, + bool debug_print = false) const { + return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col] + + util::div_rounding_up_unsafe(num_cols(), 8), + 8); + } __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } @@ -409,7 +417,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto const num_fetch_cols = fetch_block.num_cols(); auto const num_fetch_rows = fetch_block.num_rows(); auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; - auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); + auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); auto const starting_column_offset = col_offsets[fetch_block.start_col]; auto &fetch_barrier = block_barrier[fetch % NUM_BLOCKS_PER_KERNEL_LOADED]; @@ -448,7 +456,8 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; - auto const block_row_size = block.get_row_size(col_offsets, col_sizes); + auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); + auto const dest_row_size = block.get_dest_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; // copy entire rows to final dest @@ -457,7 +466,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto const relative_row = absolute_row - block.start_row; auto const output_dest = - output_data[block.buffer_num] + absolute_row * block_row_size + column_offset; + output_data[block.buffer_num] + absolute_row * dest_row_size + column_offset; auto const shared_offset = block_row_size * relative_row; cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], block_row_size, @@ -650,7 +659,7 @@ fetch_blocks_for_row_to_column(size_t &fetch_index, size_t const processing_inde auto const fetch_block_start_row = fetch_block.start_row; auto const fetch_block_end_row = fetch_block.end_row; auto const starting_col_offset = col_offsets[fetch_block.start_col]; - auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); + auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); auto const num_fetch_cols = fetch_block.num_cols(); auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( sizeof(decltype(*col_sizes)), sizeof(decltype(*col_offsets)), num_fetch_cols); @@ -766,7 +775,7 @@ __global__ void copy_to_columns(const size_type num_rows, const size_type num_co auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); - auto block_row_size = block.get_row_size(_col_offsets, _col_sizes); + auto block_row_size = block.get_shared_row_size(_col_offsets, 
_col_sizes); // now we copy from shared memory to final destination. // the data is laid out in rows in shared memory, so the reads From fb6dd51fb91d2694735ea9fdd5f86504ef78ebdb Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Wed, 6 Oct 2021 14:43:18 -0700 Subject: [PATCH 56/80] Use the new row<->col method Added a new method `convertFromRowsFixedWidthOptimized` and `convertToRowsFixedWidthOptimized` to be used for when columns are < 100. Otherwise use the new method This is currently failing simple tests --- java/src/main/java/ai/rapids/cudf/Table.java | 33 +++++++++++ java/src/main/native/src/TableJni.cpp | 56 ++++++++++++++++++- .../test/java/ai/rapids/cudf/TableTest.java | 43 +++++++++++++- 3 files changed, 128 insertions(+), 4 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 68e7a21988a..eb61ec25d9a 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -645,8 +645,12 @@ private static native long[] conditionalLeftAntiJoinGatherMapWithCount(long left private static native long[] convertToRows(long nativeHandle); + private static native long[] convertToRowsFixedWidthOptimized(long nativeHandle); + private static native long[] convertFromRows(long nativeColumnView, int[] types, int[] scale); + private static native long[] convertFromRowsFixedWidthOptimized(long nativeColumnView, int[] types, int[] scale); + private static native long[] repeatStaticCount(long tableHandle, int count); private static native long[] repeatColumnCount(long tableHandle, @@ -2730,6 +2734,15 @@ public ColumnVector[] convertToRows() { return ret; } + public ColumnVector[] convertToRowsFixedWidthOptimized() { + long[] ptrs = convertToRowsFixedWidthOptimized(nativeHandle); + ColumnVector[] ret = new ColumnVector[ptrs.length]; + for (int i = 0; i < ptrs.length; i++) { + ret[i] = new ColumnVector(ptrs[i]); + } + return ret; + } + /** * Convert a column of list of bytes that is formatted like the output from `convertToRows` * and convert it back to a table. @@ -2750,6 +2763,26 @@ public static Table convertFromRows(ColumnView vec, DType ... schema) { return new Table(convertFromRows(vec.getNativeView(), types, scale)); } + /** + * Convert a column of list of bytes that is formatted like the output from `convertToRows` + * and convert it back to a table. + * @param vec the row data to process. + * @param schema the types of each column. + * @return the parsed table. + */ + public static Table convertFromRowsFixedWidthOptimized(ColumnView vec, DType ... schema) { + // TODO at some point we need a schema that support nesting so we can support nested types + // TODO we will need scale at some point very soon too + int[] types = new int[schema.length]; + int[] scale = new int[schema.length]; + for (int i = 0; i < schema.length; i++) { + types[i] = schema[i].typeId.nativeId; + scale[i] = schema[i].getScale(); + + } + return new Table(convertFromRowsFixedWidthOptimized(vec.getNativeView(), types, scale)); + } + /** * Construct a table from a packed representation. 
* @param metadata host-based metadata for the table diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index c66cf13a5ae..97fe7b4c71e 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -604,16 +605,20 @@ class native_arrow_ipc_reader_handle final { static jlongArray convert_table_for_return(JNIEnv *env, std::unique_ptr &table_result, std::vector> &extra_columns) { + std::cout << "entering convert_table_for_return\n"; std::vector> ret = table_result->release(); int table_cols = ret.size(); int num_columns = table_cols + extra_columns.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); + std::cout << "0\n"; for (int i = 0; i < table_cols; i++) { outcol_handles[i] = reinterpret_cast(ret[i].release()); } + std::cout << "1\n"; for (size_t i = 0; i < extra_columns.size(); i++) { outcol_handles[i + table_cols] = reinterpret_cast(extra_columns[i].release()); } + std::cout << "exiting convert_table_for_return\n"; return outcol_handles.get_jArray(); } @@ -2688,14 +2693,35 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_gather(JNIEnv *env, jclas CATCH_STD(env, 0); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRowsFixedWidthOptimized(JNIEnv *env, jclass, + jlong input_table) { + JNI_NULL_CHECK(env, input_table, "input table is null", 0); + + try { + cudf::jni::auto_set_device(env); + cudf::table_view *n_input_table = reinterpret_cast(input_table); + std::vector> cols = cudf::old_convert_to_rows(*n_input_table); + int num_columns = cols.size(); + cudf::jni::native_jlongArray outcol_handles(env, num_columns); + for (int i = 0; i < num_columns; i++) { + outcol_handles[i] = reinterpret_cast(cols[i].release()); + } + return outcol_handles.get_jArray(); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env, jclass, jlong input_table) { JNI_NULL_CHECK(env, input_table, "input table is null", 0); try { + std::cout << "convert_to_rows\n"; cudf::jni::auto_set_device(env); cudf::table_view *n_input_table = reinterpret_cast(input_table); - std::vector> cols = cudf::java::convert_to_rows(*n_input_table); + std::cout << "before convert_to_rows\n"; + std::vector> cols = cudf::convert_to_rows(*n_input_table); + std::cout << "after convert_to_rows\n"; int num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); for (int i = 0; i < num_columns; i++) { @@ -2706,6 +2732,29 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env CATCH_STD(env, 0); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRowsFixedWidthOptimized(JNIEnv *env, jclass, + jlong input_column, + jintArray types, + jintArray scale) { + JNI_NULL_CHECK(env, input_column, "input column is null", 0); + JNI_NULL_CHECK(env, types, "types is null", 0); + + try { + cudf::jni::auto_set_device(env); + cudf::column_view *input = reinterpret_cast(input_column); + cudf::lists_column_view list_input(*input); + cudf::jni::native_jintArray n_types(env, types); + cudf::jni::native_jintArray n_scale(env, scale); + std::vector types_vec; + for (int i = 0; i < n_types.size(); i++) { + types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i])); + } + std::unique_ptr result = cudf::old_convert_from_rows(list_input, types_vec); + return cudf::jni::convert_table_for_return(env, result); + } + CATCH_STD(env, 0); +} 
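A minimal host-side sketch of the round trip that the convertToRows/convertFromRows JNI entry points above wrap, assuming only the signatures declared in cudf/row_conversion.hpp earlier in this series; the helper name round_trip_first_batch, the explicit includes, and the omission of stream/memory-resource arguments are illustrative, not part of the patch.

#include <memory>
#include <vector>

#include <cudf/lists/lists_column_view.hpp>
#include <cudf/row_conversion.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>

// Convert a table to packed rows and back, mirroring what the JNI pair does
// for a single returned batch of rows.
std::unique_ptr<cudf::table> round_trip_first_batch(cudf::table_view const& input)
{
  // schema for the return trip: one data_type per input column
  std::vector<cudf::data_type> schema;
  for (auto col = input.begin(); col < input.end(); ++col) {
    schema.push_back(col->type());
  }

  // each returned column is a list-of-bytes column holding one batch of rows
  auto row_batches = cudf::convert_to_rows(input);

  // convert the first batch back into columnar form
  return cudf::convert_from_rows(cudf::lists_column_view(*row_batches.front()), schema);
}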
+ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *env, jclass, jlong input_column, jintArray types, @@ -2714,6 +2763,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *e JNI_NULL_CHECK(env, types, "types is null", 0); try { + std::cout << "convert_from_rows\n"; cudf::jni::auto_set_device(env); cudf::column_view *input = reinterpret_cast(input_column); cudf::lists_column_view list_input(*input); @@ -2723,7 +2773,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *e for (int i = 0; i < n_types.size(); i++) { types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i])); } - std::unique_ptr result = cudf::java::convert_from_rows(list_input, types_vec); + std::cout << "before convert_from_rows\n"; + std::unique_ptr result = cudf::convert_from_rows(list_input, types_vec); + std::cout << "after convert_from_rows\n"; return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 280a4d33ae9..623b444676f 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -51,6 +51,7 @@ import java.nio.file.Files; import java.util.*; import java.util.stream.Collectors; +import java.util.stream.IntStream; import static ai.rapids.cudf.ColumnWriterOptions.mapColumn; import static ai.rapids.cudf.ParquetWriterOptions.listBuilder; @@ -7210,6 +7211,44 @@ void testStructColumnFilterStrings() { } } + @Test + void fixedWidthRowsRoundTripWide() { + TestBuilder tb = new TestBuilder(); + IntStream.range(0, 10).forEach(i -> tb.column(3l, 9l, 4l, 2l, 20l, null)); + IntStream.range(0, 10).forEach(i -> tb.column(5.0d, 9.5d, 0.9d, 7.23d, 2.8d, null)); + IntStream.range(0, 10).forEach(i -> tb.column(5, 1, 0, 2, 7, null)); + IntStream.range(0, 10).forEach(i -> tb.column(true, false, false, true, false, null)); + IntStream.range(0, 10).forEach(i -> tb.column(1.0f, 3.5f, 5.9f, 7.1f, 9.8f, null)); + IntStream.range(0, 10).forEach(i -> tb.column(new Byte[]{2, 3, 4, 5, 9, null})); + IntStream.range(0, 10).forEach(i -> tb.decimal32Column(-3, RoundingMode.UNNECESSARY, 5.0d, + 9.5d, 0.9d, 7.23d, 2.8d, null)); + IntStream.range(0, 10).forEach(i -> tb.decimal64Column(-8, 3L, 9L, 4L, 2L, 20L, null)); + try (Table t = tb.build()) { + ColumnVector[] rows = t.convertToRows(); + try { + // We didn't overflow + assert rows.length == 1; + ColumnVector cv = rows[0]; + assert cv.getRowCount() == t.getRowCount(); +// try (HostColumnVector hcv = cv.copyToHost()) { +// hcv.getChildColumnView(0).getDataBuffer().printBuffer(8); +// } + + DType[] types = new DType[t.getNumberOfColumns()]; + for (int i = 0; i < t.getNumberOfColumns(); i++) { + types[i] = t.getColumn(i).getType(); + } + try (Table backAgain = Table.convertFromRows(cv, types)) { + assertTablesAreEqual(t, backAgain); + } + } finally { + for (ColumnVector cv : rows) { + cv.close(); + } + } + } + } + @Test void fixedWidthRowsRoundTrip() { try (Table t = new TestBuilder() @@ -7222,7 +7261,7 @@ void fixedWidthRowsRoundTrip() { .decimal32Column(-3, RoundingMode.UNNECESSARY, 5.0d, 9.5d, 0.9d, 7.23d, 2.8d, null) .decimal64Column(-8, 3L, 9L, 4L, 2L, 20L, null) .build()) { - ColumnVector[] rows = t.convertToRows(); + ColumnVector[] rows = t.convertToRowsFixedWidthOptimized(); try { // We didn't overflow assert rows.length == 1; @@ -7236,7 +7275,7 @@ void fixedWidthRowsRoundTrip() { for (int i = 0; i 
< t.getNumberOfColumns(); i++) { types[i] = t.getColumn(i).getType(); } - try (Table backAgain = Table.convertFromRows(cv, types)) { + try (Table backAgain = Table.convertFromRowsFixedWidthOptimized(cv, types)) { assertTablesAreEqual(t, backAgain); } } finally { From b0173bfa0b0006c8def3dde0b659a996b6a6078b Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 13 Oct 2021 22:06:04 +0000 Subject: [PATCH 57/80] fixing bug with float columns when 'enough' data was present. Updated function names --- .../row_conversion/row_conversion.cpp | 8 +- cpp/include/cudf/row_conversion.hpp | 4 +- cpp/src/row_conversion/row_conversion.cu | 95 ++++--- cpp/tests/row_conversion/row_conversion.cpp | 245 ++++++++++++------ java/src/main/native/src/TableJni.cpp | 16 +- java/src/main/native/src/row_conversion.cu | 66 ++--- java/src/main/native/src/row_conversion.hpp | 19 +- 7 files changed, 265 insertions(+), 188 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index 2fe436a22c1..fb8e4c8aef3 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -50,7 +50,7 @@ static void BM_old_to_row(benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); - auto rows = cudf::old_convert_to_rows(table->view()); + auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view()); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); @@ -109,13 +109,13 @@ static void BM_old_from_row(benchmark::State& state) total_bytes += cudf::size_of(t); } - auto rows = cudf::old_convert_to_rows(table->view()); + auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view()); cudf::lists_column_view const first_list(rows.front()->view()); for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); - auto out = cudf::old_convert_from_rows(first_list, schema); + auto out = cudf::convert_from_rows_fixed_width_optimized(first_list, schema); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); @@ -144,7 +144,7 @@ static void BM_new_from_row(benchmark::State& state) total_bytes += cudf::size_of(t); } - auto rows = cudf::old_convert_to_rows(table->view()); + auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view()); cudf::lists_column_view const first_list(rows.front()->view()); for (auto _ : state) { diff --git a/cpp/include/cudf/row_conversion.hpp b/cpp/include/cudf/row_conversion.hpp index 8f82d01b06c..5d799f4c596 100644 --- a/cpp/include/cudf/row_conversion.hpp +++ b/cpp/include/cudf/row_conversion.hpp @@ -24,7 +24,7 @@ namespace cudf { -std::vector> old_convert_to_rows( +std::vector> convert_to_rows_fixed_width_optimized( cudf::table_view const& tbl, // TODO need something for validity rmm::cuda_stream_view stream = rmm::cuda_stream_default, @@ -36,7 +36,7 @@ std::vector> convert_to_rows( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -std::unique_ptr old_convert_from_rows( +std::unique_ptr convert_from_rows_fixed_width_optimized( cudf::lists_column_view const& input, std::vector const& schema, rmm::cuda_stream_view stream = rmm::cuda_stream_default, diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 84fab20fce5..0457bbf71e4 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ 
b/cpp/src/row_conversion/row_conversion.cu @@ -53,7 +53,7 @@ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 constexpr auto NUM_BLOCKS_PER_KERNEL_TO_COLUMNS = 8; -constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 2; constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; @@ -350,14 +350,6 @@ struct block_info { col_offsets[start_col]); return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); } - __host__ __device__ size_type get_dest_row_size(size_type const* const col_offsets, - size_type const* const col_sizes, - bool debug_print = false) const - { - return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col] + - util::div_rounding_up_unsafe(num_cols(), 8), - 8); - } __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } @@ -441,9 +433,8 @@ __global__ void copy_from_columns(const size_type num_rows, // else { return; } auto const blocks_remaining = - std::min((uint)(num_block_infos % NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS), - std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS, - (uint)NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS, + (uint)NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS); size_t fetch; size_t subset; @@ -451,11 +442,11 @@ __global__ void copy_from_columns(const size_type num_rows, // Fetch ahead up to stages_count subsets for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { if (debug_print) - printf("fetching block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch); - auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch]; + printf("fetching block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch); + auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch]; if (debug_print) printf("block %lu rows %d-%d and cols %d-%d\n", - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch, + blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch, fetch_block.start_row, fetch_block.end_row, fetch_block.start_col, @@ -474,9 +465,9 @@ __global__ void copy_from_columns(const size_type num_rows, // to do the copy we need to do n column copies followed by m element copies OR // we have to do m element copies followed by r row copies. When going from column // to row it is much easier to copy by elements first otherwise we would need a running - // total of the column sizes for our block, which isn't readily available. This makes it more - // appealing to copy element-wise from input data into shared matching the end layout and do - // row-based memcopies out. + // total of the column sizes for our block, which isn't readily available. This makes it + // more appealing to copy element-wise from input data into shared matching the end layout + // and do row-based memcopies out. for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { auto const relative_col = el / num_fetch_rows; @@ -499,14 +490,15 @@ __global__ void copy_from_columns(const size_type num_rows, auto const input_src = input_data[absolute_col] + col_size * absolute_row; if (debug_print) - printf("block %lu to shared chunk %lu. 
%p <- %p - %d bytes\n", + printf("block %lu to shared chunk %lu. %p <- %p(0x%x) - %d bytes\n", fetch, fetch % stages_count, &shared[fetch % stages_count][shared_offset], input_src, + *input_src, col_size); - // copy the element to global memory + // copy the element from global memory cuda::memcpy_async( &shared[fetch % stages_count][shared_offset], input_src, col_size, fetch_barrier); } @@ -515,14 +507,11 @@ __global__ void copy_from_columns(const size_type num_rows, auto& subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; subset_barrier.arrive_and_wait(); - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset]; if (debug_print) - printf("reading block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset); + printf("reading block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset); - /* auto const rows_in_block = block.num_rows(); - auto const cols_in_block = block.num_cols();*/ auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); - auto const dest_row_size = block.get_dest_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; // copy entire rows to final dest @@ -530,7 +519,7 @@ __global__ void copy_from_columns(const size_type num_rows, absolute_row += blockDim.x) { auto const relative_row = absolute_row - block.start_row; auto const output_dest = - output_data[block.buffer_num] + absolute_row * dest_row_size + column_offset; + output_data[block.buffer_num] + row_offsets[absolute_row] + column_offset; if (debug_print) printf("processing row %d\noutput data[%d] is address %p\n", absolute_row, @@ -543,6 +532,7 @@ __global__ void copy_from_columns(const size_type num_rows, &shared[subset % stages_count][shared_offset], block_row_size, absolute_row); + cuda::memcpy_async( output_dest, &shared[subset % stages_count][shared_offset], block_row_size, subset_barrier); } @@ -673,7 +663,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, if (print_debug) printf( - "%d %d - my warp is %d, %d total sections(%d x, %d y), %d warps per block, blockDim.x=%d, " + "%d %d - my warp is %d, %d total sections(%d x, %d y), %d warps per block, " + "blockDim.x=%d, " "warp size " "%d\n", threadIdx.x, @@ -709,7 +700,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, if (print_debug) printf( - "participation mask is 0x%x for relative row %d(%d real), relative col %d(%d absolute)\n", + "participation mask is 0x%x for relative row %d(%d real), relative col %d(%d " + "absolute)\n", participation_mask, relative_row, absolute_row, @@ -744,8 +736,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, absolute_col); // every thread that is participating in the warp has a byte, but it's column-based - // data and we need it in row-based. So we shuffle the bits around with ballot_sync to make - // the bytes we actually write. + // data and we need it in row-based. So we shuffle the bits around with ballot_sync to + // make the bytes we actually write. 
for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); // lead thread in each warp writes data @@ -915,7 +907,8 @@ static __device__ void fetch_blocks_for_row_to_column( block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index]; if (debug_print) printf( - "fetching block %lu of %d for start col %d, end col %d. Starting col offset is %p, ending " + "fetching block %lu of %d for start col %d, end col %d. Starting col offset is %p, " + "ending " "offset %p\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index, total_blocks, @@ -1242,7 +1235,8 @@ __global__ void copy_validity_to_columns(const size_type num_rows, block_infos, blockIdx.x); printf( - "%d %d - Shared memory starts at %p and ends at %p, input data is %p, output data is %p, row " + "%d %d - Shared memory starts at %p and ends at %p, input data is %p, output data is %p, " + "row " "offsets are %p, block infos at %p\n", threadIdx.x, blockIdx.x, @@ -1595,8 +1589,8 @@ static inline int32_t compute_fixed_width_layout(std::vector co } // Now we need to add in space for validity - // Eventually we can think about nullable vs not nullable, but for now we will just always add it - // in + // Eventually we can think about nullable vs not nullable, but for now we will just always add + // it in int32_t validity_bytes_needed = (schema.size() + 7) / 8; // cudf::util::div_rounding_up_safe(schema.size(), 8); // validity comes at the end and is byte aligned so we can pack more in. @@ -1727,11 +1721,11 @@ std::vector build_block_infos(std::vector const& column_s }; // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write - // would be memory cache line sized access, but since other blocks will read/write the edges this - // may not turn out to be overly important. For now, we will attempt to build a square window as - // far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we - // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in - // bytes, not rows or columns. + // would be memory cache line sized access, but since other blocks will read/write the edges + // this may not turn out to be overly important. For now, we will attempt to build a square + // window as far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = + // shared_mem_size since we want them equal, so height and width are sqrt(shared_mem_size). The + // trick is that it's in bytes, not rows or columns. size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); int const window_height = std::clamp( util::round_up_safe( @@ -1787,9 +1781,11 @@ std::vector build_block_infos(std::vector const& column_s calc_admin_data_size(col - current_window_start_col), shmem_limit_per_block); printf( - "Window size %d too large at column %d, admin size is %d, bumping back to build windows of " + "Window size %d too large at column %d, admin size is %d, bumping back to build windows " + "of " "size %d(cols " - "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " + "%d-%d), which is %d tall. 
Row size is too large at %d and ok at %d(aligned overall is " + "%d) " "for shared mem size %d\n", row_size_with_end_pad * window_height, col, @@ -1809,7 +1805,8 @@ std::vector build_block_infos(std::vector const& column_s detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); #if defined(DEBUG) printf( - "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " + "New window starting with offset %d and row size %d to be %d (previous column offset " + "%d+%d " "or %d)\n", row_size, col_size, @@ -2172,9 +2169,8 @@ std::vector> convert_to_rows(cudf::table_view cons #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } -std::vector> old_convert_to_rows(cudf::table_view const& tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::vector> convert_to_rows_fixed_width_optimized( + cudf::table_view const& tbl, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { const cudf::size_type num_columns = tbl.num_columns(); @@ -2399,10 +2395,11 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const& in #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } -std::unique_ptr old_convert_from_rows(cudf::lists_column_view const& input, - std::vector const& schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr convert_from_rows_fixed_width_optimized( + cudf::lists_column_view const& input, + std::vector const& schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // verify that the types are what we expect cudf::column_view child = input.child(); diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index 0ab8b70a0f7..746ac0655f7 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -14,15 +14,21 @@ * limitations under the License. 
*/ +#include +#include +#include +#include +#include +#include #include #include #include #include #include -#include -#include "cudf/lists/lists_column_view.hpp" -#include "cudf/types.hpp" +#include + +#include struct ColumnToRowTests : public cudf::test::BaseFixture { }; @@ -35,20 +41,17 @@ TEST_F(ColumnToRowTests, Single) cudf::table_view in(std::vector{a}); std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, Simple) @@ -57,20 +60,17 @@ TEST_F(ColumnToRowTests, Simple) cudf::table_view in(std::vector{a}); std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, Tall) @@ -81,21 +81,18 @@ TEST_F(ColumnToRowTests, Tall) cudf::table_view in(std::vector{a}); std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, Wide) @@ -111,20 +108,17 @@ TEST_F(ColumnToRowTests, Wide) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = 
cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, SingleByteWide) @@ -141,21 +135,18 @@ TEST_F(ColumnToRowTests, SingleByteWide) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, Non2Power) @@ -175,13 +166,14 @@ TEST_F(ColumnToRowTests, Non2Power) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); for (int j = 0; j < old_tbl->num_columns(); ++j) { @@ -190,10 +182,6 @@ TEST_F(ColumnToRowTests, Non2Power) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, Big) @@ -214,13 +202,14 @@ TEST_F(ColumnToRowTests, Big) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); for (int j = 0; j < old_tbl->num_columns(); ++j) { @@ -229,10 +218,6 @@ TEST_F(ColumnToRowTests, Big) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, Bigger) @@ -253,12 +238,13 @@ TEST_F(ColumnToRowTests, Bigger) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + 
cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); for (int j = 0; j < old_tbl->num_columns(); ++j) { @@ -267,10 +253,6 @@ TEST_F(ColumnToRowTests, Bigger) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, Biggest) @@ -291,13 +273,14 @@ TEST_F(ColumnToRowTests, Biggest) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); for (int j = 0; j < old_tbl->num_columns(); ++j) { @@ -306,9 +289,6 @@ TEST_F(ColumnToRowTests, Biggest) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(RowToColumnTests, Single) @@ -319,7 +299,8 @@ TEST_F(RowToColumnTests, Single) auto old_rows = cudf::convert_to_rows(in); std::vector schema{cudf::data_type{cudf::type_id::INT32}}; for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -331,10 +312,11 @@ TEST_F(RowToColumnTests, Simple) cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); cudf::table_view in(std::vector{a}); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); std::vector schema{cudf::data_type{cudf::type_id::INT32}}; for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -348,14 +330,15 @@ TEST_F(RowToColumnTests, Tall) cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); cudf::table_view in(std::vector{a}); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); std::vector schema; schema.reserve(in.num_columns()); for (auto col = in.begin(); col < in.end(); ++col) { schema.push_back(col->type()); } for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -373,7 +356,7 @@ 
TEST_F(RowToColumnTests, Wide) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); std::vector schema; schema.reserve(in.num_columns()); for (auto col = in.begin(); col < in.end(); ++col) { @@ -381,7 +364,8 @@ TEST_F(RowToColumnTests, Wide) } for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -399,21 +383,22 @@ TEST_F(RowToColumnTests, SingleByteWide) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); std::vector schema; schema.reserve(in.num_columns()); for (auto col = in.begin(); col < in.end(); ++col) { schema.push_back(col->type()); } for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } } -TEST_F(RowToColumnTests, Raza) +TEST_F(RowToColumnTests, AllTypes) { std::vector> cols; std::vector views; @@ -442,11 +427,115 @@ TEST_F(RowToColumnTests, Raza) cudf::table_view in({c0, c1, c2, c3, c4, c5, c6, c7}); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + auto new_rows = cudf::convert_to_rows(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, AllTypesLarge) +{ + std::vector cols; + std::vector schema{}; + + // 10 columns of each type with 1024 entries + constexpr int num_rows{1024}; + + std::default_random_engine re; + std::uniform_real_distribution rand_double(std::numeric_limits::min(), + std::numeric_limits::max()); + std::uniform_int_distribution rand_int64(std::numeric_limits::min(), + std::numeric_limits::max()); + auto r = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) -> int64_t { return rand_int64(re); }); + auto d = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) -> double { return rand_double(re); }); + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + schema.push_back(cudf::data_type{cudf::type_id::INT8}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + schema.push_back(cudf::data_type{cudf::type_id::INT16}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper(d, d + num_rows).release().release()); + 
schema.push_back(cudf::data_type{cudf::type_id::FLOAT32}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper(d, d + num_rows).release().release()); + schema.push_back(cudf::data_type{cudf::type_id::FLOAT64}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + schema.push_back(cudf::data_type{cudf::type_id::BOOL8}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper( + r, r + num_rows) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper( + r, r + num_rows) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_point_column_wrapper(r, r + num_rows, numeric::scale_type{-2}) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::DECIMAL32}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_point_column_wrapper(r, r + num_rows, numeric::scale_type{-1}) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::DECIMAL64}); + } + + std::vector views(cols.begin(), cols.end()); + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -470,10 +559,11 @@ TEST_F(RowToColumnTests, Non2Power) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -498,10 +588,11 @@ TEST_F(RowToColumnTests, Big) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -526,10 +617,11 @@ TEST_F(RowToColumnTests, Bigger) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + 
cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -554,10 +646,11 @@ TEST_F(RowToColumnTests, Biggest) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 97fe7b4c71e..76b249d591b 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -2693,14 +2693,15 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_gather(JNIEnv *env, jclas CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRowsFixedWidthOptimized(JNIEnv *env, jclass, - jlong input_table) { +JNIEXPORT jlongArray JNICALL +Java_ai_rapids_cudf_Table_convertToRowsFixedWidthOptimized(JNIEnv *env, jclass, jlong input_table) { JNI_NULL_CHECK(env, input_table, "input table is null", 0); try { cudf::jni::auto_set_device(env); cudf::table_view *n_input_table = reinterpret_cast(input_table); - std::vector> cols = cudf::old_convert_to_rows(*n_input_table); + std::vector> cols = + cudf::convert_to_rows_fixed_width_optimized(*n_input_table); int num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); for (int i = 0; i < num_columns; i++) { @@ -2732,10 +2733,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRowsFixedWidthOptimized(JNIEnv *env, jclass, - jlong input_column, - jintArray types, - jintArray scale) { +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRowsFixedWidthOptimized( + JNIEnv *env, jclass, jlong input_column, jintArray types, jintArray scale) { JNI_NULL_CHECK(env, input_column, "input column is null", 0); JNI_NULL_CHECK(env, types, "types is null", 0); @@ -2749,7 +2748,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRowsFixedWidth for (int i = 0; i < n_types.size(); i++) { types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i])); } - std::unique_ptr result = cudf::old_convert_from_rows(list_input, types_vec); + std::unique_ptr result = + cudf::convert_from_rows_fixed_width_optimized(list_input, types_vec); return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 1808c7534df..e6cd9a9da32 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -21,8 +21,6 @@ #include #include -#include -#include #include #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 @@ -30,10 +28,12 @@ #endif #include +#include #include #include #include #include +#include #include #include #include @@ -51,7 +51,7 @@ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 constexpr auto NUM_BLOCKS_PER_KERNEL_TO_COLUMNS = 8; -constexpr 
auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 2; constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; @@ -331,17 +331,9 @@ struct block_info { int buffer_num; __host__ __device__ size_type get_shared_row_size(size_type const *const col_offsets, - size_type const *const col_sizes, - bool debug_print = false) const { + size_type const *const col_sizes) const { return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); } - __host__ __device__ size_type get_dest_row_size(size_type const *const col_offsets, - size_type const *const col_sizes, - bool debug_print = false) const { - return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col] + - util::div_rounding_up_unsafe(num_cols(), 8), - 8); - } __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } @@ -404,16 +396,15 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ group.sync(); auto const blocks_remaining = - std::min((uint)(num_block_infos % NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS), - std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS, - (uint)NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS, + (uint)NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS); size_t fetch; size_t subset; for (subset = fetch = 0; subset < blocks_remaining; ++subset) { // Fetch ahead up to stages_count subsets for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { - auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch]; + auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch]; auto const num_fetch_cols = fetch_block.num_cols(); auto const num_fetch_rows = fetch_block.num_rows(); auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; @@ -429,9 +420,9 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ // to do the copy we need to do n column copies followed by m element copies OR // we have to do m element copies followed by r row copies. When going from column // to row it is much easier to copy by elements first otherwise we would need a running - // total of the column sizes for our block, which isn't readily available. This makes it more - // appealing to copy element-wise from input data into shared matching the end layout and do - // row-based memcopies out. + // total of the column sizes for our block, which isn't readily available. This makes it + // more appealing to copy element-wise from input data into shared matching the end layout + // and do row-based memcopies out. 
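The comment above is the crux of the column-to-row kernel: elements are staged into shared memory already in row-major order so the writes out to global memory can be whole contiguous rows. A minimal host-side sketch of that index mapping, assuming simplified stand-in names (staging_offset, num_block_rows) that are not part of this patch:

#include <cstddef>
#include <cstdio>

// Illustrative only: maps a flat element index within a block to its row-major
// byte offset in the shared staging buffer, mirroring the loop in the kernel.
// num_block_rows, col_offsets and row_size stand in for the kernel's
// num_fetch_rows, col_offsets[] and get_shared_row_size().
std::size_t staging_offset(int el, int num_block_rows, int start_col,
                           const int *col_offsets, int row_size) {
  int relative_col = el / num_block_rows;  // which column of the block
  int relative_row = el % num_block_rows;  // which row of the block
  int relative_col_offset =
      col_offsets[start_col + relative_col] - col_offsets[start_col];
  return static_cast<std::size_t>(relative_row) * row_size + relative_col_offset;
}

int main() {
  // two columns: a 4-byte column at offset 0 and an 8-byte column at offset 8,
  // giving an 8-byte-aligned staging row width of 16 bytes
  const int col_offsets[] = {0, 8};
  const int row_size = 16;
  for (int el = 0; el < 6; ++el)  // 2 columns x 3 rows of elements
    std::printf("element %d -> staging byte %zu\n",
                el, staging_offset(el, 3, 0, col_offsets, row_size));
  return 0;
}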
for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { auto const relative_col = el / num_fetch_rows; @@ -445,7 +436,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto const shared_offset = relative_row * fetch_block_row_size + relative_col_offset; auto const input_src = input_data[absolute_col] + col_size * absolute_row; - // copy the element to global memory + // copy the element from global memory cuda::memcpy_async(&shared[fetch % stages_count][shared_offset], input_src, col_size, fetch_barrier); } @@ -454,10 +445,8 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; subset_barrier.arrive_and_wait(); - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; - + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset]; auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); - auto const dest_row_size = block.get_dest_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; // copy entire rows to final dest @@ -466,7 +455,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto const relative_row = absolute_row - block.start_row; auto const output_dest = - output_data[block.buffer_num] + absolute_row * dest_row_size + column_offset; + output_data[block.buffer_num] + row_offsets[absolute_row] + column_offset; auto const shared_offset = block_row_size * relative_row; cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], block_row_size, @@ -563,8 +552,8 @@ __global__ void copy_validity_from_columns( input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] : 0xFF; // every thread that is participating in the warp has a byte, but it's column-based - // data and we need it in row-based. So we shuffle the bits around with ballot_sync to make - // the bytes we actually write. + // data and we need it in row-based. So we shuffle the bits around with ballot_sync to + // make the bytes we actually write. for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); // lead thread in each warp writes data @@ -1085,8 +1074,8 @@ static inline int32_t compute_fixed_width_layout(std::vector co } // Now we need to add in space for validity - // Eventually we can think about nullable vs not nullable, but for now we will just always add it - // in + // Eventually we can think about nullable vs not nullable, but for now we will just always add + // it in int32_t validity_bytes_needed = (schema.size() + 7) / 8; // cudf::util::div_rounding_up_safe(schema.size(), 8); // validity comes at the end and is byte aligned so we can pack more in. @@ -1209,11 +1198,11 @@ std::vector build_block_infos(std::vector const &column_s }; // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write - // would be memory cache line sized access, but since other blocks will read/write the edges this - // may not turn out to be overly important. For now, we will attempt to build a square window as - // far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we - // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in - // bytes, not rows or columns. 
+ // would be memory cache line sized access, but since other blocks will read/write the edges + // this may not turn out to be overly important. For now, we will attempt to build a square + // window as far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = + // shared_mem_size since we want them equal, so height and width are sqrt(shared_mem_size). The + // trick is that it's in bytes, not rows or columns. size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); int const window_height = std::clamp( util::round_up_safe( @@ -1478,8 +1467,8 @@ std::vector> convert_to_rows(cudf::table_view cons } std::vector> -old_convert_to_rows(cudf::table_view const &tbl, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { +convert_to_rows_fixed_width_optimized(cudf::table_view const &tbl, rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { const cudf::size_type num_columns = tbl.num_columns(); std::vector schema; @@ -1656,10 +1645,9 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } -std::unique_ptr old_convert_from_rows(cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { +std::unique_ptr convert_from_rows_fixed_width_optimized( + cudf::lists_column_view const &input, std::vector const &schema, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { // verify that the types are what we expect cudf::column_view child = input.child(); cudf::type_id list_type = child.type().id(); diff --git a/java/src/main/native/src/row_conversion.hpp b/java/src/main/native/src/row_conversion.hpp index 517202f3892..edc2768d4bb 100644 --- a/java/src/main/native/src/row_conversion.hpp +++ b/java/src/main/native/src/row_conversion.hpp @@ -25,11 +25,11 @@ namespace cudf { namespace java { -std::vector> -old_convert_to_rows(cudf::table_view const &tbl, - // TODO need something for validity - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); +std::vector> convert_to_rows_fixed_width_optimized( + cudf::table_view const &tbl, + // TODO need something for validity + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); std::vector> convert_to_rows(cudf::table_view const &tbl, @@ -37,11 +37,10 @@ convert_to_rows(cudf::table_view const &tbl, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); -std::unique_ptr -old_convert_from_rows(cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); +std::unique_ptr convert_from_rows_fixed_width_optimized( + cudf::lists_column_view const &input, std::vector const &schema, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, std::vector const &schema, From 81cbaa60c6ae5dfa40ccaea821de575d7fd19d9e Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Fri, 15 Oct 2021 15:20:52 -0700 Subject: [PATCH 58/80] code cleanup and removed comments --- java/src/main/native/src/TableJni.cpp | 10 
---------- 1 file changed, 10 deletions(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 76b249d591b..45403f1eb0d 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -605,20 +605,16 @@ class native_arrow_ipc_reader_handle final { static jlongArray convert_table_for_return(JNIEnv *env, std::unique_ptr &table_result, std::vector> &extra_columns) { - std::cout << "entering convert_table_for_return\n"; std::vector> ret = table_result->release(); int table_cols = ret.size(); int num_columns = table_cols + extra_columns.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); - std::cout << "0\n"; for (int i = 0; i < table_cols; i++) { outcol_handles[i] = reinterpret_cast(ret[i].release()); } - std::cout << "1\n"; for (size_t i = 0; i < extra_columns.size(); i++) { outcol_handles[i + table_cols] = reinterpret_cast(extra_columns[i].release()); } - std::cout << "exiting convert_table_for_return\n"; return outcol_handles.get_jArray(); } @@ -2717,12 +2713,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env JNI_NULL_CHECK(env, input_table, "input table is null", 0); try { - std::cout << "convert_to_rows\n"; cudf::jni::auto_set_device(env); cudf::table_view *n_input_table = reinterpret_cast(input_table); - std::cout << "before convert_to_rows\n"; std::vector> cols = cudf::convert_to_rows(*n_input_table); - std::cout << "after convert_to_rows\n"; int num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); for (int i = 0; i < num_columns; i++) { @@ -2763,7 +2756,6 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *e JNI_NULL_CHECK(env, types, "types is null", 0); try { - std::cout << "convert_from_rows\n"; cudf::jni::auto_set_device(env); cudf::column_view *input = reinterpret_cast(input_column); cudf::lists_column_view list_input(*input); @@ -2773,9 +2765,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *e for (int i = 0; i < n_types.size(); i++) { types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i])); } - std::cout << "before convert_from_rows\n"; std::unique_ptr result = cudf::convert_from_rows(list_input, types_vec); - std::cout << "after convert_from_rows\n"; return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); From 58eb43f7e1e23baa68fe6813a4200cb54a0321b2 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 21 Oct 2021 00:53:01 +0000 Subject: [PATCH 59/80] Fixing validity buffer alignment issue for row data --- cpp/src/row_conversion/row_conversion.cu | 142 ++++++++++++-------- cpp/tests/row_conversion/row_conversion.cpp | 63 ++++++--- java/src/main/native/src/row_conversion.cu | 58 +++++--- 3 files changed, 165 insertions(+), 98 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 0457bbf71e4..90bd8b88ef0 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -469,6 +469,7 @@ __global__ void copy_from_columns(const size_type num_rows, // more appealing to copy element-wise from input data into shared matching the end layout // and do row-based memcopies out. 
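The hunks below replace the plain memcpy_async calls with ones that carry an alignment hint: for 2-, 4- and 8-byte elements, cuda::aligned_size_t<N> promises the transfer is naturally aligned so the copy can use wider hardware transactions. A minimal sketch of that dispatch pulled out as a hypothetical helper (the name copy_element_async and the block-scope barrier parameter are assumptions, not code from this patch):

#include <cuda/barrier>
#include <cstdint>

// Illustrative helper: forwards the element size as a compile-time alignment
// hint when it is 2, 4 or 8 bytes, and falls back to an unhinted copy
// otherwise. Intended to be called from inside a kernel that owns the barrier.
__device__ inline void copy_element_async(
    int8_t *dst, int8_t const *src, int size,
    cuda::barrier<cuda::thread_scope_block> &barrier) {
  switch (size) {
    case 2: cuda::memcpy_async(dst, src, cuda::aligned_size_t<2>(size), barrier); break;
    case 4: cuda::memcpy_async(dst, src, cuda::aligned_size_t<4>(size), barrier); break;
    case 8: cuda::memcpy_async(dst, src, cuda::aligned_size_t<8>(size), barrier); break;
    default: cuda::memcpy_async(dst, src, size, barrier); break;
  }
}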
+ auto const shared_buffer_base = shared[fetch % stages_count]; for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { auto const relative_col = el / num_fetch_rows; auto const relative_row = el % num_fetch_rows; @@ -493,14 +494,36 @@ __global__ void copy_from_columns(const size_type num_rows, printf("block %lu to shared chunk %lu. %p <- %p(0x%x) - %d bytes\n", fetch, fetch % stages_count, - &shared[fetch % stages_count][shared_offset], + &shared_buffer_base[shared_offset], input_src, *input_src, col_size); // copy the element from global memory - cuda::memcpy_async( - &shared[fetch % stages_count][shared_offset], input_src, col_size, fetch_barrier); + switch (col_size) { + case 2: + cuda::memcpy_async(&shared_buffer_base[shared_offset], + input_src, + cuda::aligned_size_t<2>(col_size), + fetch_barrier); + break; + case 4: + cuda::memcpy_async(&shared_buffer_base[shared_offset], + input_src, + cuda::aligned_size_t<4>(col_size), + fetch_barrier); + break; + case 8: + cuda::memcpy_async(&shared_buffer_base[shared_offset], + input_src, + cuda::aligned_size_t<8>(col_size), + fetch_barrier); + break; + default: + cuda::memcpy_async( + &shared_buffer_base[shared_offset], input_src, col_size, fetch_barrier); + break; + } } } @@ -511,15 +534,15 @@ __global__ void copy_from_columns(const size_type num_rows, if (debug_print) printf("reading block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset); - auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); - auto const column_offset = col_offsets[block.start_col]; + auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); + auto const column_offset = col_offsets[block.start_col]; + auto const block_output_buffer = output_data[block.buffer_num]; // copy entire rows to final dest for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; absolute_row += blockDim.x) { auto const relative_row = absolute_row - block.start_row; - auto const output_dest = - output_data[block.buffer_num] + row_offsets[absolute_row] + column_offset; + auto const output_dest = block_output_buffer + row_offsets[absolute_row] + column_offset; if (debug_print) printf("processing row %d\noutput data[%d] is address %p\n", absolute_row, @@ -533,8 +556,10 @@ __global__ void copy_from_columns(const size_type num_rows, block_row_size, absolute_row); - cuda::memcpy_async( - output_dest, &shared[subset % stages_count][shared_offset], block_row_size, subset_barrier); + cuda::memcpy_async(output_dest, + &shared[subset % stages_count][shared_offset], + cuda::aligned_size_t<8>(block_row_size), + subset_barrier); } } @@ -641,8 +666,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, auto const num_block_cols = block.num_cols(); auto const num_block_rows = block.num_rows(); - auto const num_sections_x = (num_block_cols + 31) / 32; - auto const num_sections_y = (num_block_rows + 7) / 8; + auto const num_sections_x = util::div_rounding_up_unsafe(num_block_cols, 32); + auto const num_sections_y = util::div_rounding_up_unsafe(num_block_rows, 32); auto const validity_data_row_length = align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); auto const total_sections = num_sections_x * num_sections_y; @@ -690,7 +715,7 @@ __global__ void copy_validity_from_columns(const size_type num_rows, my_section_idx, total_sections); auto const relative_col = section_x * 32 + lane_id; - auto const relative_row = section_y * 8; + auto const relative_row = section_y * 32; 
auto const absolute_col = relative_col + block.start_col; auto const absolute_row = relative_row + block.start_row; auto const cols_left = num_columns - absolute_col; @@ -720,15 +745,15 @@ __global__ void copy_validity_from_columns(const size_type num_rows, absolute_row, relative_col, absolute_col); - auto my_byte = - input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] : 0xFF; + auto my_data = input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] + : std::numeric_limits::max(); if (print_debug) printf( - "thread %d's byte is 0x%x, participation mask is 0x%x for relative row %d(%d real), " + "thread %d's bytes are 0x%x, participation mask is 0x%x for relative row %d(%d real), " "relative col %d(%d absolute)\n", threadIdx.x, - my_byte & 0xFF, + my_data, participation_mask, relative_row, absolute_row, @@ -738,8 +763,9 @@ __global__ void copy_validity_from_columns(const size_type num_rows, // every thread that is participating in the warp has a byte, but it's column-based // data and we need it in row-based. So we shuffle the bits around with ballot_sync to // make the bytes we actually write. - for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { - auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); + bitmask_type dw_mask = 1; + for (int i = 0; i < 32 && relative_row + i < num_rows; ++i, dw_mask <<= 1) { + auto validity_data = __ballot_sync(participation_mask, my_data & dw_mask); // lead thread in each warp writes data auto const validity_write_offset = validity_data_row_length * (relative_row + i) + relative_col / 8; @@ -750,8 +776,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, "0x%x\n", threadIdx.x, blockIdx.x, - byte_mask, - my_byte & byte_mask, + dw_mask, + my_data & dw_mask, validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED, validity_write_offset, validity_data); @@ -804,6 +830,9 @@ __global__ void copy_validity_from_columns(const size_type num_rows, // make sure entire block has finished copy group.sync(); + auto const output_data_base = + output_data[block.buffer_num] + validity_offset + block.start_col / 8; + // now async memcpy the shared memory out to the final destination for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { auto const relative_row = row - block.start_row; @@ -835,9 +864,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, word_index(block.start_col), this_shared_block[validity_data_row_length * relative_row]); } - auto const output_ptr = - output_data[block.buffer_num] + row_offsets[row] + validity_offset + block.start_col / 8; - auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); + auto const output_ptr = output_data_base + row_offsets[row]; + auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); cuda::memcpy_async( output_ptr, @@ -970,11 +998,20 @@ static __device__ void fetch_blocks_for_row_to_column( row += blockDim.x) { auto shared_offset = (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; if (debug_print) - printf("fetching block %lu to shared chunk %lu. %p <- %p\n", - fetch_index, - fetch_index % max_resident_blocks, - &shared[fetch_index % max_resident_blocks][shared_offset], - &input_data[row_offsets[row] + starting_col_offset]); + printf( + "%d - fetching block %lu to shared chunk %lu. 
%p(shared[%d %% %d][%d]) <- %p(row %d, row " + "offset %d starting col offset %d)\n", + threadIdx.x, + fetch_index, + fetch_index % max_resident_blocks, + &shared[fetch_index % max_resident_blocks][shared_offset], + (int)fetch_index, + max_resident_blocks, + shared_offset, + &input_data[row_offsets[row] + starting_col_offset], + row, + row_offsets[row], + starting_col_offset); // copy the main cuda::memcpy_async(&shared[fetch_index % max_resident_blocks][shared_offset], &input_data[row_offsets[row] + starting_col_offset], @@ -1021,7 +1058,7 @@ __global__ void copy_to_columns(const size_type num_rows, // to speed up some of the random access memory we do, we copy col_sizes and col_offsets // to shared memory for each of the blocks that we work on - /*constexpr*/ bool debug_print = false; // threadIdx.x == 0 && blockIdx.x == 0; + constexpr bool debug_print = false; // threadIdx.x == 2 && blockIdx.x == 0; constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; auto group = cooperative_groups::this_thread_block(); extern __shared__ int8_t shared_data[]; @@ -1094,12 +1131,12 @@ __global__ void copy_to_columns(const size_type num_rows, auto& subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; // ensure our data is ready - if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) + if (debug_print) printf("%d-%d waiting at barrier %p\n", threadIdx.x, blockIdx.x, &subset_barrier); subset_barrier.arrive_and_wait(); auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; - if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) + if (debug_print) printf("%d-%d reading block %lu at address %p\n", threadIdx.x, blockIdx.x, @@ -1159,19 +1196,19 @@ __global__ void copy_to_columns(const size_type num_rows, if (debug_print) { printf( - "relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, " - "shared_mmeory_row_offset: %d, shared_memory_offset: %d," - " column_size: %d, shmem_src: %p, dst: %p\n",//, uint32 is %u\n", - relative_col, - relative_row, - absolute_col, - absolute_row, - shared_memory_row_offset, - shared_memory_offset, - column_size, - shmem_src, - dst/*, - *reinterpret_cast(shmem_src)*/); + "relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, " + "shared_mmeory_row_offset: %d, shared_memory_offset: %d," + " column_size: %d, shmem_src: %p, dst: %p\n",//, uint32 is %u\n", + relative_col, + relative_row, + absolute_col, + absolute_row, + shared_memory_row_offset, + shared_memory_offset, + column_size, + shmem_src, + dst/*, + *reinterpret_cast(shmem_src)*/); printf("memcpy_async(%p, %p, %d, subset_barrier);\n", dst, shmem_src, column_size); } if (debug_print && absolute_col == 0 && absolute_row == 51) { @@ -1185,7 +1222,7 @@ __global__ void copy_to_columns(const size_type num_rows, cuda::memcpy_async(dst, shmem_src, column_size, subset_barrier); } group.sync(); - if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) + if (debug_print) printf( "%d-%d copy to main memory with barrier %p\n", threadIdx.x, blockIdx.x, &subset_barrier); } @@ -1224,9 +1261,7 @@ __global__ void copy_validity_to_columns(const size_type num_rows, int8_t* shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { shared_data, shared_data + shmem_used_per_block / 2}; - bool print_debug = false; // threadIdx.x == 0 && blockIdx.x == 0; - // bool print_debug = false; - // if (blockIdx.x != 3 || threadIdx.x / 32 != 0) return; + constexpr bool print_debug = false; // threadIdx.x == 0 && blockIdx.x == 0; if (print_debug) 
{ printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); printf("%d %d - block infos are at %p and my index is %d\n", @@ -1246,10 +1281,6 @@ __global__ void copy_validity_to_columns(const size_type num_rows, output_nm, row_offsets, block_infos); - /* printf("Row Offsets:\n"); - for (int i=0; i double { return rand_double(re); }); + auto all_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 1; }); + auto none_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 0; }); + auto most_valid = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return rand() % 2 == 0 ? 0 : 1; }); + auto few_valid = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return rand() % 13 == 0 ? 1 : 0; }); + for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, all_valid) + .release() + .release()); schema.push_back(cudf::data_type{cudf::type_id::INT8}); } for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) + .release() + .release()); schema.push_back(cudf::data_type{cudf::type_id::INT16}); } for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + if (i < 5) { + cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) + .release() + .release()); + } else { + cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, none_valid) + .release() + .release()); + } schema.push_back(cudf::data_type{cudf::type_id::INT32}); } for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper(d, d + num_rows).release().release()); + cols.push_back(*cudf::test::fixed_width_column_wrapper(d, d + num_rows, most_valid) + .release() + .release()); schema.push_back(cudf::data_type{cudf::type_id::FLOAT32}); } for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper(d, d + num_rows).release().release()); + cols.push_back(*cudf::test::fixed_width_column_wrapper(d, d + num_rows, most_valid) + .release() + .release()); schema.push_back(cudf::data_type{cudf::type_id::FLOAT64}); } for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) + .release() + .release()); schema.push_back(cudf::data_type{cudf::type_id::BOOL8}); } for (int i = 0; i < 10; ++i) { cols.push_back( *cudf::test::fixed_width_column_wrapper( - r, r + num_rows) + r, r + num_rows, all_valid) .release() .release()); schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}); @@ -505,25 +524,25 @@ TEST_F(RowToColumnTests, AllTypesLarge) for (int i = 0; i < 10; ++i) { cols.push_back( *cudf::test::fixed_width_column_wrapper( - r, r + num_rows) + r, r + num_rows, most_valid) .release() .release()); schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}); } for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_point_column_wrapper(r, r + num_rows, numeric::scale_type{-2}) - .release() - .release()); + cols.push_back(*cudf::test::fixed_point_column_wrapper( + r, r + num_rows, all_valid, 
numeric::scale_type{-2}) + .release() + .release()); schema.push_back(cudf::data_type{cudf::type_id::DECIMAL32}); } for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_point_column_wrapper(r, r + num_rows, numeric::scale_type{-1}) - .release() - .release()); + cols.push_back(*cudf::test::fixed_point_column_wrapper( + r, r + num_rows, most_valid, numeric::scale_type{-1}) + .release() + .release()); schema.push_back(cudf::data_type{cudf::type_id::DECIMAL64}); } diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index e6cd9a9da32..a67589fbaec 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -424,6 +424,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ // more appealing to copy element-wise from input data into shared matching the end layout // and do row-based memcopies out. + auto const shared_buffer_base = shared[fetch % stages_count]; for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { auto const relative_col = el / num_fetch_rows; auto const relative_row = el % num_fetch_rows; @@ -437,8 +438,24 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto const input_src = input_data[absolute_col] + col_size * absolute_row; // copy the element from global memory - cuda::memcpy_async(&shared[fetch % stages_count][shared_offset], input_src, col_size, - fetch_barrier); + switch (col_size) { + case 2: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, + cuda::aligned_size_t<2>(col_size), fetch_barrier); + break; + case 4: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, + cuda::aligned_size_t<4>(col_size), fetch_barrier); + break; + case 8: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, + cuda::aligned_size_t<8>(col_size), fetch_barrier); + break; + default: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, col_size, + fetch_barrier); + break; + } } } @@ -448,18 +465,17 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset]; auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; + auto const block_output_buffer = output_data[block.buffer_num]; // copy entire rows to final dest for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; absolute_row += blockDim.x) { - auto const relative_row = absolute_row - block.start_row; - auto const output_dest = - output_data[block.buffer_num] + row_offsets[absolute_row] + column_offset; + auto const output_dest = block_output_buffer + row_offsets[absolute_row] + column_offset; auto const shared_offset = block_row_size * relative_row; - cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], block_row_size, - subset_barrier); + cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], + cuda::aligned_size_t<8>(block_row_size), subset_barrier); } } @@ -523,8 +539,8 @@ __global__ void copy_validity_from_columns( auto const num_block_cols = block.num_cols(); auto const num_block_rows = block.num_rows(); - auto const num_sections_x = (num_block_cols + 31) / 32; - auto const num_sections_y = (num_block_rows + 7) / 8; + auto const num_sections_x = util::div_rounding_up_unsafe(num_block_cols, 32); + auto const 
num_sections_y = util::div_rounding_up_unsafe(num_block_rows, 32); auto const validity_data_row_length = align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); auto const total_sections = num_sections_x * num_sections_y; @@ -536,26 +552,27 @@ __global__ void copy_validity_from_columns( // the block is divided into sections. A warp operates on a section at a time. for (int my_section_idx = warp_id; my_section_idx < total_sections; my_section_idx += warps_per_block) { - // convert to rows and cols auto const section_x = my_section_idx % num_sections_x; auto const section_y = my_section_idx / num_sections_x; auto const relative_col = section_x * 32 + lane_id; - auto const relative_row = section_y * 8; + auto const relative_row = section_y * 32; auto const absolute_col = relative_col + block.start_col; auto const absolute_row = relative_row + block.start_row; auto const cols_left = num_columns - absolute_col; auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); if (absolute_col < num_columns) { - auto my_byte = - input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] : 0xFF; + auto my_data = input_nm[absolute_col] != nullptr ? + input_nm[absolute_col][absolute_row / 32] : + std::numeric_limits::max(); // every thread that is participating in the warp has a byte, but it's column-based // data and we need it in row-based. So we shuffle the bits around with ballot_sync to // make the bytes we actually write. - for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { - auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); + bitmask_type dw_mask = 1; + for (int i = 0; i < 32 && relative_row + i < num_rows; ++i, dw_mask <<= 1) { + auto validity_data = __ballot_sync(participation_mask, my_data & dw_mask); // lead thread in each warp writes data auto const validity_write_offset = validity_data_row_length * (relative_row + i) + relative_col / 8; @@ -585,11 +602,13 @@ __global__ void copy_validity_from_columns( // make sure entire block has finished copy group.sync(); + auto const output_data_base = + output_data[block.buffer_num] + validity_offset + block.start_col / 8; + // now async memcpy the shared memory out to the final destination for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { auto const relative_row = row - block.start_row; - auto const output_ptr = - output_data[block.buffer_num] + row_offsets[row] + validity_offset + block.start_col / 8; + auto const output_ptr = output_data_base + row_offsets[row]; auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); cuda::memcpy_async( @@ -917,8 +936,6 @@ __global__ void copy_validity_to_columns( // now async memcpy the shared for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { auto const relative_col = col - block.start_col; - auto const words_to_copy = util::div_rounding_up_unsafe(num_block_rows, 32); - auto const starting_address = output_nm[col] + word_index(block_start_row); cuda::memcpy_async( output_nm[col] + word_index(block_start_row), @@ -1111,7 +1128,7 @@ static size_type compute_column_information(iterator begin, iterator end, fixed_width_size_per_row += col_size; } - auto validity_offset = detail::align_offset(fixed_width_size_per_row, 4); + auto validity_offset = fixed_width_size_per_row; column_starts.push_back(validity_offset); return fixed_width_size_per_row; @@ -1233,7 +1250,6 @@ std::vector 
build_block_infos(std::vector const &column_s if (row_size_with_end_pad * window_height + calc_admin_data_size(col - current_window_start_col) > shmem_limit_per_block) { - // too large, close this window, generate vertical blocks and restart build_blocks(current_window_start_col, col == 0 ? col : col - 1, window_height); row_size = From 06837f061795c6bc09b530e42d6dd14cbcf1af5f Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 21 Oct 2021 18:02:07 +0000 Subject: [PATCH 60/80] Cleaning up code for PR --- cpp/src/row_conversion/row_conversion.cu | 4132 ++++++++------------ java/src/main/native/src/row_conversion.cu | 237 +- 2 files changed, 1740 insertions(+), 2629 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 90bd8b88ef0..c068a2c0b76 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -14,2487 +14,1653 @@ * limitations under the License. */ -#include -#include -#include -#include -#include -#include -#include "cudf/detail/iterator.cuh" -#include "cudf/lists/lists_column_device_view.cuh" - -#include - -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include - -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 -constexpr auto NUM_BLOCKS_PER_KERNEL_TO_COLUMNS = 8; -constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 2; -constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; -constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; -constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; -#endif - -using cudf::detail::make_device_uvector_async; -using rmm::device_uvector; -namespace cudf { - -namespace detail { - -static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size_t alignment) -{ - return (offset + alignment - 1) & ~(alignment - 1); -} - -__global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, - const cudf::size_type num_columns, - const cudf::size_type row_size, - const cudf::size_type* input_offset_in_row, - const cudf::size_type* num_bytes, - int8_t** output_data, - cudf::bitmask_type** output_nm, - const int8_t* input_data) -{ - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // For simplicity we will refer to this as a row_group - - // In practice we have found writing more than 4 columns of data per thread - // results in performance loss. As such we are using a 2 dimensional - // kernel in terms of threads, but not in terms of blocks. Columns are - // controlled by the y dimension (there is no y dimension in blocks). Rows - // are controlled by the x dimension (there are multiple blocks in the x - // dimension). 
- - cudf::size_type rows_per_group = blockDim.x; - cudf::size_type row_group_start = blockIdx.x; - cudf::size_type row_group_stride = gridDim.x; - cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; - - extern __shared__ int8_t shared_data[]; - - // Because we are copying fixed width only data and we stride the rows - // this thread will always start copying from shared data in the same place - int8_t* row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t* row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; - - for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; - row_group_index += row_group_stride) { - // Step 1: Copy the data into shared memory - // We know row_size is always aligned with and a multiple of int64_t; - int64_t* long_shared = reinterpret_cast(shared_data); - const int64_t* long_input = reinterpret_cast(input_data); - - cudf::size_type shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); - cudf::size_type shared_output_stride = blockDim.x * blockDim.y; - cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); - if (row_index_end > num_rows) { row_index_end = num_rows; } - cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - cudf::size_type shared_length = row_size * num_rows_in_group; - - cudf::size_type shared_output_end = shared_length / sizeof(int64_t); - - cudf::size_type start_input_index = - (row_size * row_group_index * rows_per_group) / sizeof(int64_t); - - for (cudf::size_type shared_index = shared_output_index; shared_index < shared_output_end; - shared_index += shared_output_stride) { - long_shared[shared_index] = long_input[start_input_index + shared_index]; - } - // Wait for all of the data to be in shared memory - __syncthreads(); - - // Step 2 copy the data back out - - // Within the row group there should be 1 thread for each row. This is a - // requirement for launching the kernel - cudf::size_type row_index = (row_group_index * rows_per_group) + threadIdx.x; - // But we might not use all of the threads if the number of rows does not go - // evenly into the thread count. We don't want those threads to exit yet - // because we may need them to copy data in for the next row group. - uint32_t active_mask = __ballot_sync(0xffffffff, row_index < num_rows); - if (row_index < num_rows) { - cudf::size_type col_index_start = threadIdx.y; - cudf::size_type col_index_stride = blockDim.y; - for (cudf::size_type col_index = col_index_start; col_index < num_columns; - col_index += col_index_stride) { - cudf::size_type col_size = num_bytes[col_index]; - const int8_t* col_tmp = &(row_tmp[input_offset_in_row[col_index]]); - int8_t* col_output = output_data[col_index]; - switch (col_size) { - case 1: { - col_output[row_index] = *col_tmp; - break; - } - case 2: { - int16_t* short_col_output = reinterpret_cast(col_output); - short_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - case 4: { - int32_t* int_col_output = reinterpret_cast(col_output); - int_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - case 8: { - int64_t* long_col_output = reinterpret_cast(col_output); - long_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - default: { - cudf::size_type output_offset = col_size * row_index; - // TODO this should just not be supported for fixed width columns, but just in case... 
- for (cudf::size_type b = 0; b < col_size; b++) { - col_output[b + output_offset] = col_tmp[b]; - } - break; - } - } - - cudf::bitmask_type* nm = output_nm[col_index]; - int8_t* valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; - int predicate = *valid_byte & (1 << byte_bit_offset); - uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } - } // end column loop - } // end row copy - // wait for the row_group to be totally copied before starting on the next row group - __syncthreads(); - } -} - -__global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, - const cudf::size_type num_rows, - const cudf::size_type num_columns, - const cudf::size_type row_size, - const cudf::size_type* output_offset_in_row, - const cudf::size_type* num_bytes, - const int8_t** input_data, - const cudf::bitmask_type** input_nm, - int8_t* output_data) -{ - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // We do not support copying a subset of the columns in a row yet, so we don't - // currently support a row that is wider than shared memory. - // For simplicity we will refer to this as a row_group - - // In practice we have found reading more than 4 columns of data per thread - // results in performance loss. As such we are using a 2 dimensional - // kernel in terms of threads, but not in terms of blocks. Columns are - // controlled by the y dimension (there is no y dimension in blocks). Rows - // are controlled by the x dimension (there are multiple blocks in the x - // dimension). - - cudf::size_type rows_per_group = blockDim.x; - cudf::size_type row_group_start = blockIdx.x; - cudf::size_type row_group_stride = gridDim.x; - cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; - - extern __shared__ int8_t shared_data[]; - - // Because we are copying fixed width only data and we stride the rows - // this thread will always start copying to shared data in the same place - int8_t* row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t* row_vld_tmp = - &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; - - for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; - row_group_index += row_group_stride) { - // Within the row group there should be 1 thread for each row. This is a - // requirement for launching the kernel - cudf::size_type row_index = start_row + (row_group_index * rows_per_group) + threadIdx.x; - // But we might not use all of the threads if the number of rows does not go - // evenly into the thread count. We don't want those threads to exit yet - // because we may need them to copy data back out. 
- if (row_index < (start_row + num_rows)) { - cudf::size_type col_index_start = threadIdx.y; - cudf::size_type col_index_stride = blockDim.y; - for (cudf::size_type col_index = col_index_start; col_index < num_columns; - col_index += col_index_stride) { - cudf::size_type col_size = num_bytes[col_index]; - int8_t* col_tmp = &(row_tmp[output_offset_in_row[col_index]]); - const int8_t* col_input = input_data[col_index]; - switch (col_size) { - case 1: { - *col_tmp = col_input[row_index]; - break; - } - case 2: { - const int16_t* short_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = short_col_input[row_index]; - break; - } - case 4: { - const int32_t* int_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = int_col_input[row_index]; - break; - } - case 8: { - const int64_t* long_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = long_col_input[row_index]; - break; - } - default: { - cudf::size_type input_offset = col_size * row_index; - // TODO this should just not be supported for fixed width columns, but just in case... - for (cudf::size_type b = 0; b < col_size; b++) { - col_tmp[b] = col_input[b + input_offset]; - } - break; - } - } - // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned - // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t* valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; - uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; - int32_t* valid_int = reinterpret_cast(valid_byte - fixup_bytes); - cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); - // Now copy validity for the column - if (input_nm[col_index]) { - if (bit_is_set(input_nm[col_index], row_index)) { - atomicOr_block(valid_int, 1 << int_bit_offset); - } else { - atomicAnd_block(valid_int, ~(1 << int_bit_offset)); - } - } else { - // It is valid so just set the bit - atomicOr_block(valid_int, 1 << int_bit_offset); - } - } // end column loop - } // end row copy - // wait for the row_group to be totally copied into shared memory - __syncthreads(); - - // Step 2: Copy the data back out - // We know row_size is always aligned with and a multiple of int64_t; - int64_t* long_shared = reinterpret_cast(shared_data); - int64_t* long_output = reinterpret_cast(output_data); - - cudf::size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); - cudf::size_type shared_input_stride = blockDim.x * blockDim.y; - cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); - if (row_index_end > num_rows) { row_index_end = num_rows; } - cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - cudf::size_type shared_length = row_size * num_rows_in_group; - - cudf::size_type shared_input_end = shared_length / sizeof(int64_t); - - cudf::size_type start_output_index = - (row_size * row_group_index * rows_per_group) / sizeof(int64_t); - - for (cudf::size_type shared_index = shared_input_index; shared_index < shared_input_end; - shared_index += shared_input_stride) { - long_output[start_output_index + shared_index] = long_shared[shared_index]; - } - __syncthreads(); - // Go for the next round - } -} - -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - -struct block_info { - int start_col; - int start_row; - int end_col; - int end_row; - int buffer_num; - - __host__ __device__ size_type get_shared_row_size(size_type const* const col_offsets, - size_type const* const col_sizes, - bool 
debug_print = false) const - { - if (debug_print) - printf("col_offsets[%d]: %p + col_sizes[%d]: %p - col_offsets[%d]: %p\n%d + %d - %d\n", - end_col, - &col_offsets[end_col], - end_col, - &col_sizes[end_col], - start_col, - &col_offsets[start_col], - col_offsets[end_col], - col_sizes[end_col], - col_offsets[start_col]); - return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); - } - __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } - - __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } -}; - -// When building the columns to return, we have to be mindful of the offset limit in cudf. -// It is 32-bit and these data columns are capable of surpassing that easily. The data should -// not be cut off exactly at the limit though due to the validity buffers. The most efficient -// place to cut the validity is on a 32-row boundary, so as we calculate the row sizes -// we keep track of the cut points for the validity, which we call row batches. If the row -// is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we -// hit. Note that this boundary is for our book-keeping with column pointers and not anything that -// the kernel needs to worry about. We cut the output at convienient boundaries when assembling -// the outgoing data stream. -struct row_batch { - size_type num_bytes; - size_type row_count; -}; - -/** - * @brief copy data from cudf columns into x format, which is row-based - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param input_data pointer to raw table data - * @param input_nm pointer to validity data - * @param col_sizes array of sizes for each element in a column - one per column - * @param col_offsets offset into input data row for each column's start - * @param block_infos information about the blocks of work - * @param row_offsets offset to a specific row in the input data - * @param output_data pointer to output data - * - */ -__global__ void copy_from_columns(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_block, - const size_type num_block_infos, - const int8_t** input_data, - const size_type* col_sizes, - const size_type* col_offsets, - const block_info* block_infos, - const size_type* row_offsets, - int8_t** output_data) -{ - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // This has been broken up for us in the block_info struct, so we don't have - // any calculation to do here, but it is important to note. 
- - constexpr bool debug_print = false; // blockIdx.x == 0 && threadIdx.x == 1; - - constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; - auto group = cooperative_groups::this_thread_block(); - extern __shared__ int8_t shared_data[]; - int8_t* shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; - - __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&block_barrier[i], group.size()); - } - } - - group.sync(); - - if (debug_print) { - printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("col sizes at %p, col offsets at %p, and row offsets at %p\n", - col_sizes, - col_offsets, - row_offsets); - printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); - printf("output data to %p\n", output_data[block_infos[blockIdx.x].buffer_num]); - printf("shared memory pointers are %p and %p\n", shared[0], shared[1]); - printf("shared_memory ends at %p\n", &shared_data[shmem_used_per_block * 2]); - printf("group is %d threads\n", group.size()); - } - // else { return; } - - auto const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS, - (uint)NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS); - - size_t fetch; - size_t subset; - for (subset = fetch = 0; subset < blocks_remaining; ++subset) { - // Fetch ahead up to stages_count subsets - for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { - if (debug_print) - printf("fetching block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch); - auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch]; - if (debug_print) - printf("block %lu rows %d-%d and cols %d-%d\n", - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch, - fetch_block.start_row, - fetch_block.end_row, - fetch_block.start_col, - fetch_block.end_col); - - auto const num_fetch_cols = fetch_block.num_cols(); - auto const num_fetch_rows = fetch_block.num_rows(); - auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; - auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); - auto const starting_column_offset = col_offsets[fetch_block.start_col]; - auto& fetch_barrier = block_barrier[fetch % NUM_BLOCKS_PER_KERNEL_LOADED]; - - // wait for the last use of the memory to be completed - if (fetch > NUM_BLOCKS_PER_KERNEL_LOADED) { fetch_barrier.arrive_and_wait(); } - - // to do the copy we need to do n column copies followed by m element copies OR - // we have to do m element copies followed by r row copies. When going from column - // to row it is much easier to copy by elements first otherwise we would need a running - // total of the column sizes for our block, which isn't readily available. This makes it - // more appealing to copy element-wise from input data into shared matching the end layout - // and do row-based memcopies out. 
- - auto const shared_buffer_base = shared[fetch % stages_count]; - for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { - auto const relative_col = el / num_fetch_rows; - auto const relative_row = el % num_fetch_rows; - auto const absolute_col = relative_col + fetch_block.start_col; - auto const absolute_row = relative_row + fetch_block.start_row; - if (debug_print) - printf("row %d(%d), col %d(%d), %d fetch rows, element %d\n", - relative_row, - absolute_row, - relative_col, - absolute_col, - num_fetch_rows, - el); - auto const col_size = col_sizes[absolute_col]; - auto const col_offset = col_offsets[absolute_col]; - auto const relative_col_offset = col_offset - starting_column_offset; - - auto const shared_offset = relative_row * fetch_block_row_size + relative_col_offset; - auto const input_src = input_data[absolute_col] + col_size * absolute_row; - - if (debug_print) - printf("block %lu to shared chunk %lu. %p <- %p(0x%x) - %d bytes\n", - fetch, - fetch % stages_count, - &shared_buffer_base[shared_offset], - input_src, - *input_src, - col_size); - - // copy the element from global memory - switch (col_size) { - case 2: - cuda::memcpy_async(&shared_buffer_base[shared_offset], - input_src, - cuda::aligned_size_t<2>(col_size), - fetch_barrier); - break; - case 4: - cuda::memcpy_async(&shared_buffer_base[shared_offset], - input_src, - cuda::aligned_size_t<4>(col_size), - fetch_barrier); - break; - case 8: - cuda::memcpy_async(&shared_buffer_base[shared_offset], - input_src, - cuda::aligned_size_t<8>(col_size), - fetch_barrier); - break; - default: - cuda::memcpy_async( - &shared_buffer_base[shared_offset], input_src, col_size, fetch_barrier); - break; - } - } - } - - auto& subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; - subset_barrier.arrive_and_wait(); - - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset]; - if (debug_print) - printf("reading block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset); - - auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); - auto const column_offset = col_offsets[block.start_col]; - auto const block_output_buffer = output_data[block.buffer_num]; - - // copy entire rows to final dest - for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; - absolute_row += blockDim.x) { - auto const relative_row = absolute_row - block.start_row; - auto const output_dest = block_output_buffer + row_offsets[absolute_row] + column_offset; - if (debug_print) - printf("processing row %d\noutput data[%d] is address %p\n", - absolute_row, - absolute_row, - output_dest); - auto const shared_offset = block_row_size * relative_row; - if (debug_print) - printf("memcpy %p <- %p - %d bytes which is row %d\n", - output_dest, - &shared[subset % stages_count][shared_offset], - block_row_size, - absolute_row); - - cuda::memcpy_async(output_dest, - &shared[subset % stages_count][shared_offset], - cuda::aligned_size_t<8>(block_row_size), - subset_barrier); - } - } - - // wait on the last copies to complete - for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { - block_barrier[i].arrive_and_wait(); - } -} - -/** - * @brief copy data from row-based format to cudf columns - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets - * @param output_data pointer to 
output data, partitioned by data size - * @param validity_offsets offset into input data row for validity data - * @param block_infos information about the blocks of work - * @param num_block_infos number of infos in blocks array - * @param input_data pointer to input data - * - */ -__global__ void copy_validity_from_columns(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_block, - const size_type* row_offsets, - int8_t** output_data, - const size_type validity_offset, - const block_info* block_infos, - const size_type num_block_infos, - const bitmask_type** input_nm) -{ - extern __shared__ int8_t shared_data[]; - int8_t* shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { - shared_data, shared_data + shmem_used_per_block / 2}; - - constexpr bool print_debug = false; // threadIdx.x==0 && blockIdx.x == 0; - // if (blockIdx.x != 3 || threadIdx.x / 32 != 0) return; - if (print_debug) { - printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("%d %d - block infos are at %p and my index is %d\n", - threadIdx.x, - blockIdx.x, - block_infos, - blockIdx.x); - printf("%d %d - input nm is %p, input_nm[0] is at %p\n", - threadIdx.x, - blockIdx.x, - input_nm, - input_nm[0]); - printf("shared memory is %p to %p\n", shared_data, shared_data + shmem_used_per_block * 2); - printf("block infos at %p and this is index %d\n", - &block_infos, - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + 0); - /* printf("Row Offsets:\n"); - for (int i=0; i - shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&shared_block_barriers[i], group.size()); - } - } - - group.sync(); - - for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { - if (validity_block != validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { - if (print_debug) - printf("%d: waiting at barrier %d\n", - threadIdx.x, - validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED); - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] - .arrive_and_wait(); - if (print_debug) printf("past barrier...\n"); - } - int8_t* this_shared_block = shared_blocks[validity_block % 2]; - if (print_debug) printf("top of loop for validity block %d\n", validity_block); - if (print_debug) - printf("reading validity block info %d at %p\n", - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block, - &block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]); - auto block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; - - auto const num_block_cols = block.num_cols(); - auto const num_block_rows = block.num_rows(); - - auto const num_sections_x = util::div_rounding_up_unsafe(num_block_cols, 32); - auto const num_sections_y = util::div_rounding_up_unsafe(num_block_rows, 32); - auto const validity_data_row_length = - align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); - auto const total_sections = num_sections_x * num_sections_y; - - if (print_debug) { - printf("%d %d - block %d has %d cols, %d rows, %d row length, and %d total sections\n", - threadIdx.x, - blockIdx.x, - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block, - num_block_cols, - num_block_rows, - validity_data_row_length, - total_sections); - } - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = 
std::max(1u, blockDim.x / detail::warp_size); - - if (print_debug) - printf( - "%d %d - my warp is %d, %d total sections(%d x, %d y), %d warps per block, " - "blockDim.x=%d, " - "warp size " - "%d\n", - threadIdx.x, - blockIdx.x, - warp_id, - total_sections, - num_sections_x, - num_sections_y, - warps_per_block, - blockDim.x, - detail::warp_size); - // the block is divided into sections. A warp operates on a section at a time. - for (int my_section_idx = warp_id; my_section_idx < total_sections; - my_section_idx += warps_per_block) { - // convert to rows and cols - auto const section_x = my_section_idx % num_sections_x; - auto const section_y = my_section_idx / num_sections_x; - - if (print_debug) - printf("working on section %d,%d - %d of %d...\n", - section_x, - section_y, - my_section_idx, - total_sections); - auto const relative_col = section_x * 32 + lane_id; - auto const relative_row = section_y * 32; - auto const absolute_col = relative_col + block.start_col; - auto const absolute_row = relative_row + block.start_row; - auto const cols_left = num_columns - absolute_col; - - if (print_debug) printf("pre ballot sync...\n"); - auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); - - if (print_debug) - printf( - "participation mask is 0x%x for relative row %d(%d real), relative col %d(%d " - "absolute)\n", - participation_mask, - relative_row, - absolute_row, - relative_col, - absolute_col); - - if (absolute_col < num_columns) { - if (print_debug) - printf( - "thread %d's byte is at %p, participation mask is 0x%x for relative row %d(%d real), " - "relative col %d(%d absolute)\n", - threadIdx.x, - &input_nm[absolute_col][absolute_row / 32], - participation_mask, - relative_row, - absolute_row, - relative_col, - absolute_col); - auto my_data = input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] - : std::numeric_limits::max(); - - if (print_debug) - printf( - "thread %d's bytes are 0x%x, participation mask is 0x%x for relative row %d(%d real), " - "relative col %d(%d absolute)\n", - threadIdx.x, - my_data, - participation_mask, - relative_row, - absolute_row, - relative_col, - absolute_col); - - // every thread that is participating in the warp has a byte, but it's column-based - // data and we need it in row-based. So we shuffle the bits around with ballot_sync to - // make the bytes we actually write. 
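A condensed sketch of that warp-level shuffle: each active lane holds the 32-row validity word of one column, and 32 rounds of __ballot_sync rebuild the same bits row-major, one 32-column word per row. The free function and parameter names are illustrative, not the kernel's code.

#include <cstdint>

// Lane `lane` owns one column of a 32x32 bit tile: my_col_word holds that
// column's validity for 32 consecutive rows. Round i ballots row i's bit from
// every participating lane, producing that row laid out across columns.
__device__ void transpose_validity_tile(uint32_t participation_mask,
                                        uint32_t my_col_word,
                                        int rows_in_tile,
                                        uint32_t row_words[32])
{
  uint32_t row_bit = 1;
  for (int i = 0; i < rows_in_tile; ++i, row_bit <<= 1) {
    // bit `lane` of the ballot is column `lane`'s validity for row i
    uint32_t const row_word = __ballot_sync(participation_mask, my_col_word & row_bit);
    if (threadIdx.x % 32 == 0) { row_words[i] = row_word; }  // lead lane keeps the row word
  }
}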
- bitmask_type dw_mask = 1; - for (int i = 0; i < 32 && relative_row + i < num_rows; ++i, dw_mask <<= 1) { - auto validity_data = __ballot_sync(participation_mask, my_data & dw_mask); - // lead thread in each warp writes data - auto const validity_write_offset = - validity_data_row_length * (relative_row + i) + relative_col / 8; - if (threadIdx.x % detail::warp_size == 0) { - if (print_debug) - printf( - "%d %d - byte_mask is 0x%x, masked_byte is 0x%x, shared_data_block[%d][%d] = " - "0x%x\n", - threadIdx.x, - blockIdx.x, - dw_mask, - my_data & dw_mask, - validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED, - validity_write_offset, - validity_data); - if (cols_left <= 8) { - // write byte - if (print_debug) - printf("%d %d - writing single byte to shared offset 0x%x which is %p...\n", - threadIdx.x, - blockIdx.x, - validity_write_offset, - &this_shared_block[validity_write_offset]); - this_shared_block[validity_write_offset] = validity_data & 0xFF; - } else if (cols_left <= 16) { - // write int16 - if (print_debug) - printf("%d %d - writing two bytes to shared offset 0x%x which is %p...\n", - threadIdx.x, - blockIdx.x, - validity_write_offset, - &this_shared_block[validity_write_offset]); - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - } else if (cols_left <= 24) { - // write int16 and then int8 - if (print_debug) - printf("%d %d - writing three bytes to shared offset 0x%x which is %p...\n", - threadIdx.x, - blockIdx.x, - validity_write_offset, - &this_shared_block[validity_write_offset]); - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; - } else { - // write int32 - if (print_debug) - printf("%d %d - writing 4 bytes to shared offset 0x%x which is %p...\n", - threadIdx.x, - blockIdx.x, - validity_write_offset, - &this_shared_block[validity_write_offset]); - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data; - } - } - } - } - } - - // make sure entire block has finished copy - group.sync(); - - auto const output_data_base = - output_data[block.buffer_num] + validity_offset + block.start_col / 8; - - // now async memcpy the shared memory out to the final destination - for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { - auto const relative_row = row - block.start_row; - if (print_debug) { - printf( - "base output data is %p, row offset is 0x%x, validity offset into row is 0x%x, word " - "index of block start is 0x%x\n", - output_data[block.buffer_num], - row_offsets[row], - validity_offset, - word_index(block.start_col)); - printf( - "%d %d - row %d/%d/%d col %d-%d - %p = shared_data_block[%d][%d] which is %p - %d " - "bytes\n - %p <- 0x%x\n", - threadIdx.x, - blockIdx.x, - block.start_row, - row, - block.end_row, - block.start_col, - block.end_col, - output_data[block.buffer_num] + row_offsets[row] + validity_offset + - (word_index(block.start_col)), - validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED, - validity_data_row_length * relative_row, - &this_shared_block[validity_data_row_length * relative_row], - util::div_rounding_up_unsafe(num_block_cols, 8), - output_data[block.buffer_num] + row_offsets[row] + validity_offset + - word_index(block.start_col), - this_shared_block[validity_data_row_length * relative_row]); - } - auto const output_ptr = output_data_base + row_offsets[row]; - auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); - - 
cuda::memcpy_async( - output_ptr, - &this_shared_block[validity_data_row_length * relative_row], - num_bytes, - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); - } - } - - // wait for last blocks of data to arrive - for (int validity_block = 0; - validity_block < blocks_remaining % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - ++validity_block) { - shared_block_barriers[validity_block].arrive_and_wait(); - } -} - -static __device__ std::tuple get_admin_data_sizes(size_t col_size_size, - size_t col_offset_size, - int const num_cols) -{ - auto const col_size_bytes = num_cols * col_size_size; - auto const col_offset_bytes = num_cols * col_offset_size; - - return {col_size_bytes, col_offset_bytes}; -} - -/** - * @brief ensure `read_ahead` buffer blocks are fetched - * - * @param fetch_index internal state passed into the function - * @param processing_index index where processing is occuring - * @param read_ahead_count how many blocks to read ahead - * @param max_resident_blocks how many blocks can be loaded at once - * @param total_blocks total number of blocks overall - * @param block_infos pointer to the block infos - * @param col_sizes pointer to column size information - * @param col_offsets pointer to the table's column offsets - * @param row_offsets pointer to offsets for each row in the table - * @param input_data pointer to the input data - * @param shared pointer to shared memory - * @param group thread group participating in the fetch - * @param block_barrier barriers used for each block - * @param debug_print - * @return - */ -static __device__ void fetch_blocks_for_row_to_column( - size_t& fetch_index, - size_t const processing_index, - int const read_ahead_count, - int const max_resident_blocks, - int const total_blocks, - block_info const* const block_infos, - size_type const* const col_sizes, - size_type const* const col_offsets, - size_type const* const row_offsets, - int8_t const* const input_data, - int8_t* shared[], - cooperative_groups::thread_block const group, - cuda::barrier* block_barrier, - bool debug_print) -{ - for (; fetch_index < static_cast(total_blocks) && - fetch_index < (processing_index + read_ahead_count); - ++fetch_index) { - auto const fetch_block = - block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index]; - if (debug_print) - printf( - "fetching block %lu of %d for start col %d, end col %d. 
Starting col offset is %p, " - "ending " - "offset %p\n", - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index, - total_blocks, - fetch_block.start_col, - fetch_block.end_col, - &col_offsets[fetch_block.start_col], - &col_offsets[fetch_block.end_col]); - auto const fetch_block_start_row = fetch_block.start_row; - auto const fetch_block_end_row = fetch_block.end_row; - auto const starting_col_offset = col_offsets[fetch_block.start_col]; - - auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); - auto const num_fetch_cols = fetch_block.num_cols(); - auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( - sizeof(decltype(*col_sizes)), sizeof(decltype(*col_offsets)), num_fetch_cols); - auto& fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; - - // if we have fetched all buffers, we need to wait for processing - // to complete on them before we can use them again - if (fetch_index > NUM_BLOCKS_PER_KERNEL_LOADED) { fetch_barrier.arrive_and_wait(); } - - auto shared_row_offset = 0; - // copy the data for column sizes - if (debug_print) - printf("%d: col sizes memcpy_async(group, %p, %p, %d, barrier);\n", - threadIdx.x, - &shared[fetch_index % max_resident_blocks][shared_row_offset], - &col_offsets[fetch_block.start_col], - col_size_bytes); - if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) - printf("%d-%d fetching to %p with barrier %p\n", - threadIdx.x, - blockIdx.x, - shared[fetch_index % max_resident_blocks], - &fetch_barrier); - cuda::memcpy_async(group, - &shared[fetch_index % max_resident_blocks][shared_row_offset], - &col_sizes[fetch_block.start_col], - col_size_bytes, - fetch_barrier); - shared_row_offset += col_size_bytes; - // copy the data for column offsets - if (debug_print) - printf("%d: offsets memcpy_async(group, %p, %p, %d, barrier);\n", - threadIdx.x, - &shared[fetch_index % max_resident_blocks][shared_row_offset], - &col_offsets[fetch_block.start_col], - col_offset_bytes); - cuda::memcpy_async(group, - &shared[fetch_index % max_resident_blocks][shared_row_offset], - &col_offsets[fetch_block.start_col], - col_offset_bytes, - fetch_barrier); - shared_row_offset += col_offset_bytes; - shared_row_offset = align_offset(shared_row_offset, 8); - - for (auto row = fetch_block_start_row + static_cast(threadIdx.x); - row <= fetch_block_end_row; - row += blockDim.x) { - auto shared_offset = (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; - if (debug_print) - printf( - "%d - fetching block %lu to shared chunk %lu. 
%p(shared[%d %% %d][%d]) <- %p(row %d, row " - "offset %d starting col offset %d)\n", - threadIdx.x, - fetch_index, - fetch_index % max_resident_blocks, - &shared[fetch_index % max_resident_blocks][shared_offset], - (int)fetch_index, - max_resident_blocks, - shared_offset, - &input_data[row_offsets[row] + starting_col_offset], - row, - row_offsets[row], - starting_col_offset); - // copy the main - cuda::memcpy_async(&shared[fetch_index % max_resident_blocks][shared_offset], - &input_data[row_offsets[row] + starting_col_offset], - fetch_block_row_size, - fetch_barrier); - } - } -} - -/** - * @brief copy data from row-based format to cudf columns - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block - * @param row_offsets - * @param output_data - * @param output_nm - * @param col_sizes array of sizes for each element in a column - one per column - * @param col_offsets offset into input data row for each column's start - * @param block_infos information about the blocks of work - * @param input_data pointer to input data - * - */ -__global__ void copy_to_columns(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_block, - const size_type* row_offsets, - int8_t** output_data, - const size_type* _col_sizes, - const size_type* _col_offsets, - const block_info* block_infos, - const size_type num_block_infos, - const int8_t* input_data) -{ - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // This has been broken up for us in the block_info struct, so we don't have - // any calculation to do here, but it is important to note. 
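Each staged block in this row-to-column path is laid out in shared memory as the per-column sizes, then the per-column offsets, then the row data padded up to an 8-byte boundary. A host-side sketch of how those offsets fall out (the helper names and the 4-byte size_type are assumptions made for the example):

#include <cstddef>
#include <cstdint>

// Round offset up to the next multiple of a power-of-two alignment, as the
// kernel's align_offset() does.
constexpr int32_t align_up(int32_t offset, std::size_t alignment)
{
  return (offset + alignment - 1) & ~(alignment - 1);
}

// Shared-memory layout of one staged block:
// [col sizes][col offsets][pad to 8][row 0][row 1]...
struct staged_block_layout {
  std::size_t col_sizes_bytes;    // num_cols * sizeof(size_type)
  std::size_t col_offsets_bytes;  // num_cols * sizeof(size_type)
  std::size_t row_data_offset;    // where row 0 starts, 8-byte aligned
};

staged_block_layout make_layout(int num_cols, std::size_t size_type_bytes = 4)
{
  std::size_t const sizes_bytes   = num_cols * size_type_bytes;
  std::size_t const offsets_bytes = num_cols * size_type_bytes;
  auto const admin = static_cast<int32_t>(sizes_bytes + offsets_bytes);
  return {sizes_bytes, offsets_bytes, static_cast<std::size_t>(align_up(admin, 8))};
}
// e.g. 10 columns of 4-byte size_type: 40 + 40 = 80 admin bytes, already a
// multiple of 8, so the block's row data starts at byte 80 of its shared buffer.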
- - // to speed up some of the random access memory we do, we copy col_sizes and col_offsets - // to shared memory for each of the blocks that we work on - - constexpr bool debug_print = false; // threadIdx.x == 2 && blockIdx.x == 0; - constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; - auto group = cooperative_groups::this_thread_block(); - extern __shared__ int8_t shared_data[]; - int8_t* shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; - - if (debug_print) { - printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf( - "%d block infos are at %p and my index is %d\n", num_block_infos, block_infos, blockIdx.x); - /* printf("Row Offsets:\n"); - for (int i=0; i block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&block_barrier[i], group.size()); - } - } - - group.sync(); - - auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS, - (uint)NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); - - auto get_admin_data_sizes = [col_size_size = sizeof(decltype(*_col_sizes)), - col_offset_size = sizeof(decltype(*_col_offsets))]( - int const num_cols, - int const num_rows) -> std::tuple { - auto const col_size_bytes = num_cols * col_size_size; - auto const col_offset_bytes = num_cols * col_offset_size; - - return {col_size_bytes, col_offset_bytes}; - }; - - if (debug_print) - printf("%d blocks remaining -> %d block infos, %d block index\n", - blocks_remaining, - num_block_infos, - blockIdx.x); - size_t fetch; - size_t subset; - for (subset = fetch = 0; subset < blocks_remaining; ++subset) { - // Fetch ahead up to stages_count subsets - fetch_blocks_for_row_to_column(fetch, - subset, - stages_count, - stages_count, - blocks_remaining, - block_infos, - _col_sizes, - _col_offsets, - row_offsets, - input_data, - shared, - group, - block_barrier, - debug_print); - - auto& subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; - // ensure our data is ready - if (debug_print) - printf("%d-%d waiting at barrier %p\n", threadIdx.x, blockIdx.x, &subset_barrier); - subset_barrier.arrive_and_wait(); - - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; - if (debug_print) - printf("%d-%d reading block %lu at address %p\n", - threadIdx.x, - blockIdx.x, - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset, - shared[subset % stages_count]); - - auto const rows_in_block = block.num_rows(); - auto const cols_in_block = block.num_cols(); - - auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes(cols_in_block, rows_in_block); - // auto shared_row_offsets = shared[subset]; - auto shared_col_sizes = reinterpret_cast(shared[subset % stages_count]); - auto shared_col_offsets = - reinterpret_cast(&shared[subset % stages_count][col_size_bytes]); - - auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); - - auto block_row_size = block.get_shared_row_size(_col_offsets, _col_sizes, debug_print); - - // now we copy from shared memory to final destination. - // the data is laid out in rows in shared memory, so the reads - // for a column will be "vertical". Because of this and the different - // sizes for each column, this portion is handled on row/column basis. 
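The element loop that follows walks one flattened index per thread stride so any thread count divides the work evenly; with the column varying fastest, neighbouring threads touch neighbouring columns of the same staged row. A small sketch of the index arithmetic (plain ints and illustrative names):

// thread t handles flat elements t, t + blockDim.x, t + 2 * blockDim.x, ...
struct element_pos {
  int relative_row;
  int relative_col;
};

inline element_pos decompose(int flat_index, int cols_in_block)
{
  return {flat_index / cols_in_block, flat_index % cols_in_block};
}

// source offset inside the staged block: skip the admin prefix, jump to the
// row, then add the column's offset relative to the block's first column
inline int shared_source_offset(int admin_prefix, int relative_row, int block_row_size,
                                int col_offset, int first_col_offset)
{
  return admin_prefix + relative_row * block_row_size + (col_offset - first_col_offset);
}
// the destination is simply output_data[absolute_col] + absolute_row * column_size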
- // to prevent each thread working on a single row and also to ensure - // that all threads can do work in the case of more threads than rows, - // we do a global index instead of a double for loop with col/row. - for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { - auto const relative_col = index % cols_in_block; - auto const relative_row = index / cols_in_block; - auto const absolute_col = relative_col + block.start_col; - auto const absolute_row = relative_row + block.start_row; - - if (debug_print) - printf("copying for row %d(%d absolute) col %d(%d absolute)\n", - relative_row, - absolute_row, - relative_col, - absolute_col); - - auto const shared_memory_row_offset = block_row_size * relative_row; - if (debug_print) - printf("shared_col_offsets is %p and relative col is %d, making me access %p\n", - shared_col_offsets, - relative_col, - &shared_col_offsets[relative_col]); - auto const shared_memory_offset = shared_col_offsets[relative_col] - shared_col_offsets[0] + - shared_memory_row_offset + shared_row_offset; - if (debug_print) - printf("shared_col_sizes is %p and relative col is %d, making me access %p\n", - shared_col_sizes, - relative_col, - &shared_col_sizes[relative_col]); - auto const column_size = shared_col_sizes[relative_col]; - - int8_t* shmem_src = &shared[subset % stages_count][shared_memory_offset]; - int8_t* dst = &output_data[absolute_col][absolute_row * column_size]; - - if (debug_print) { - printf( - "relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, " - "shared_mmeory_row_offset: %d, shared_memory_offset: %d," - " column_size: %d, shmem_src: %p, dst: %p\n",//, uint32 is %u\n", - relative_col, - relative_row, - absolute_col, - absolute_row, - shared_memory_row_offset, - shared_memory_offset, - column_size, - shmem_src, - dst/*, - *reinterpret_cast(shmem_src)*/); - printf("memcpy_async(%p, %p, %d, subset_barrier);\n", dst, shmem_src, column_size); - } - if (debug_print && absolute_col == 0 && absolute_row == 51) { - printf("col0row51(%d bytes) = %p - 0x", column_size, shmem_src); - for (int i = 0; i < column_size; ++i) { - printf("%x ", shmem_src[i]); - } - printf("\n"); - } - - cuda::memcpy_async(dst, shmem_src, column_size, subset_barrier); - } - group.sync(); - if (debug_print) - printf( - "%d-%d copy to main memory with barrier %p\n", threadIdx.x, blockIdx.x, &subset_barrier); - } - - // wait on the last copies to complete - for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { - block_barrier[i].arrive_and_wait(); - } -} - -/** - * @brief copy data from row-based format to cudf columns - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets - * @param output_nm - * @param validity_offsets offset into input data row for validity data - * @param block_infos information about the blocks of work - * @param num_block_infos number of infos in blocks array - * @param input_data pointer to input data - * - */ -__global__ void copy_validity_to_columns(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_block, - const size_type* row_offsets, - cudf::bitmask_type** output_nm, - const size_type validity_offset, - const block_info* block_infos, - const size_type num_block_infos, - const int8_t* input_data) -{ - extern __shared__ int8_t shared_data[]; - int8_t* 
shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { - shared_data, shared_data + shmem_used_per_block / 2}; - - constexpr bool print_debug = false; // threadIdx.x == 0 && blockIdx.x == 0; - if (print_debug) { - printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("%d %d - block infos are at %p and my index is %d\n", - threadIdx.x, - blockIdx.x, - block_infos, - blockIdx.x); - printf( - "%d %d - Shared memory starts at %p and ends at %p, input data is %p, output data is %p, " - "row " - "offsets are %p, block infos at %p\n", - threadIdx.x, - blockIdx.x, - shared_data, - shared_data + shmem_used_per_block, - input_data, - output_nm, - row_offsets, - block_infos); - } - // else { return; } - - // per conversation with DaveB - // each thread of warp reads a single byte of validity - so we read 32 bytes - // then ballot_sync the bits and write the result to shmem - // after we fill shared mem memcpy it out in a blob. - // probably need knobs for number of rows vs columns to balance read/write - auto group = cooperative_groups::this_thread_block(); - - int const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, - (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); - - if (print_debug) printf("%d blocks with %d in group\n", blocks_remaining, group.size()); - - __shared__ cuda::barrier - shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&shared_block_barriers[i], group.size()); - } - } - - group.sync(); - - for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { - auto const validity_index = validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - if (validity_block != validity_index) { - shared_block_barriers[validity_index].arrive_and_wait(); - } - int8_t* this_shared_block = shared_blocks[validity_block % 2]; - auto const block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; - auto const block_start_col = block.start_col; - auto const block_start_row = block.start_row; - - auto const num_block_cols = block.num_cols(); - auto const num_block_rows = block.num_rows(); - - auto const num_sections_x = (num_block_cols + 7) / 8; - auto const num_sections_y = (num_block_rows + 31) / 32; - auto const validity_data_col_length = num_sections_y * 4; // words to bytes - auto const total_sections = num_sections_x * num_sections_y; - - if (print_debug) { - printf("%d %d - block %d has %d cols, %d rows, and %d total sections\n", - threadIdx.x, - blockIdx.x, - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block, - num_block_cols, - num_block_rows, - total_sections); - } - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); - - if (print_debug) - printf( - "%d %d - my warp is %d, %d total sections, %d warps per block, blockDim.x=%d, warp side " - "%d\n", - threadIdx.x, - blockIdx.x, - warp_id, - total_sections, - warps_per_block, - blockDim.x, - detail::warp_size); - // the block is divided into sections. A warp operates on a section at a time. 
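For this row-to-column validity pass a section is 8 columns wide and 32 rows tall: each lane of the warp reads one row's validity byte and the ballots below rebuild it column-major, one word per 32 rows. A worked host-side example of the bookkeeping (numbers are illustrative):

#include <cstdio>

// ceil(a / b) for positive ints, as util::div_rounding_up_unsafe does
constexpr int div_up(int a, int b) { return (a + b - 1) / b; }

int main()
{
  int const num_block_cols = 20;   // example block: 20 columns x 100 rows
  int const num_block_rows = 100;

  int const num_sections_x = div_up(num_block_cols, 8);        // 3 sections across
  int const num_sections_y = div_up(num_block_rows, 32);       // 4 sections down
  int const total_sections = num_sections_x * num_sections_y;  // 12 warp-sized tasks

  // one 32-bit validity word per 32 rows, stored per column in shared memory
  int const validity_data_col_length = num_sections_y * 4;     // 16 bytes per column

  std::printf("%d sections, %d bytes of shared validity per column\n",
              total_sections, validity_data_col_length);
  return 0;
}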
- for (int my_section_idx = warp_id; my_section_idx < total_sections; - my_section_idx += warps_per_block) { - // convert to rows and cols - auto const section_x = my_section_idx % num_sections_x; - auto const section_y = my_section_idx / num_sections_x; - - auto const relative_col = section_x * 8; - auto const relative_row = section_y * 32 + lane_id; - auto const absolute_col = relative_col + block_start_col; - auto const absolute_row = relative_row + block_start_row; - auto const rows_left = num_rows - absolute_row; - - /* if (print_debug) - printf("%d-%d: si: %d nsx: %d nsy: %d sx: %d sy: %d ar: %d nr: %d rc: %d rr: %d\n", - threadIdx.x, - blockIdx.x, - my_section_idx, - num_sections_x, - num_sections_y, - section_x, - section_y, - absolute_row, - num_rows, - relative_col, - relative_row);*/ - auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_row < num_rows); - - if (absolute_row < num_rows) { - auto const my_byte = - input_data[row_offsets[absolute_row] + validity_offset + absolute_col / 8]; - - // so every thread that is participating in the warp has a byte, but it's row-based - // data and we need it in column-based. So we shiffle the bits around to make - // the bytes we actually write. - for (int i = 0, byte_mask = 1; i < 8 && relative_col + i < num_columns; - ++i, byte_mask <<= 1) { - auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); - // lead thread in each warp writes data - if (threadIdx.x % detail::warp_size == 0) { - auto const validity_write_offset = - validity_data_col_length * (relative_col + i) + relative_row / 8; - - if (print_debug) - printf( - "%d - Writing validity data for column %d, row %d 0x%x to shared memory location " - "%d(%d * (%d + %d) + %d / 8)\n", - threadIdx.x, - absolute_col + i, - absolute_row, - validity_data, - validity_write_offset, - validity_data_col_length, - relative_col, - i, - relative_row); - - if (rows_left <= 8) { - // write byte - this_shared_block[validity_write_offset] = validity_data & 0xFF; - } else if (rows_left <= 16) { - // write int16 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - } else if (rows_left <= 24) { - // write int16 and then int8 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; - } else { - // write int32 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data; - } - } - } - } - } - - // make sure entire block has finished copy - group.sync(); - - // now async memcpy the shared - for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { - auto const relative_col = col - block.start_col; - auto const words_to_copy = util::div_rounding_up_unsafe(num_block_rows, 32); - auto const starting_address = output_nm[col] + word_index(block_start_row); - - if (print_debug) - printf("%d %d - col %d memcpy_async(%p(offset %d), %p, %d, subset_barrier); - 0x%x\n", - threadIdx.x, - blockIdx.x, - col, - starting_address, - word_index(block_start_row), - &this_shared_block[validity_data_col_length * relative_col], - words_to_copy * 4, - this_shared_block[validity_data_col_length * relative_col]); - cuda::memcpy_async( - output_nm[col] + word_index(block_start_row), - &this_shared_block[validity_data_col_length * relative_col], - util::div_rounding_up_unsafe(num_block_rows, 8), - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); - } - } - - // wait for last 
blocks of data to arrive - auto const num_blocks_to_wait = blocks_remaining > NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED - ? NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED - : blocks_remaining; - for (int validity_block = 0; validity_block < num_blocks_to_wait; ++validity_block) { - shared_block_barriers[validity_block].arrive_and_wait(); - } -} - -#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - -/** - * Calculate the dimensions of the kernel for fixed width only columns. - * @param [in] num_columns the number of columns being copied. - * @param [in] num_rows the number of rows being copied. - * @param [in] size_per_row the size each row takes up when padded. - * @param [out] blocks the size of the blocks for the kernel - * @param [out] threads the size of the threads for the kernel - * @return the size in bytes of shared memory needed for each block. - */ -static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, - const cudf::size_type num_rows, - const cudf::size_type size_per_row, - dim3& blocks, - dim3& threads) -{ - // We have found speed degrades when a thread handles more than 4 columns. - // Each block is 2 dimensional. The y dimension indicates the columns. - // We limit this to 32 threads in the y dimension so we can still - // have at least 32 threads in the x dimension (1 warp) which should - // result in better coalescing of memory operations. We also - // want to guarantee that we are processing a multiple of 32 threads - // in the x dimension because we use atomic operations at the block - // level when writing validity data out to main memory, and that would - // need to change if we split a word of validity data between blocks. - int y_block_size = (num_columns + 3) / 4; // cudf::util::div_rounding_up_safe(num_columns, 4); - if (y_block_size > 32) { y_block_size = 32; } - int x_possible_block_size = 1024 / y_block_size; - // 48KB is the default setting for shared memory per block according to the cuda tutorials - // If someone configures the GPU to only have 16 KB this might not work. - int max_shared_size = 48 * 1024; - int max_block_size = max_shared_size / size_per_row; - // If we don't have enough shared memory there is no point in having more threads - // per block that will just sit idle - max_block_size = max_block_size > x_possible_block_size ? x_possible_block_size : max_block_size; - // Make sure that the x dimension is a multiple of 32 this not only helps - // coalesce memory access it also lets us do a ballot sync for validity to write - // the data back out the warp level. If x is a multiple of 32 then each thread in the y - // dimension is associated with one or more warps, that should correspond to the validity - // words directly. - int block_size = (max_block_size / 32) * 32; - CUDF_EXPECTS(block_size != 0, "Row size is too large to fit in shared memory"); - - int num_blocks = (num_rows + block_size - 1) / block_size; - if (num_blocks < 1) { - num_blocks = 1; - } else if (num_blocks > 10240) { - // The maximum number of blocks supported in the x dimension is 2 ^ 31 - 1 - // but in practice haveing too many can cause some overhead that I don't totally - // understand. Playing around with this haveing as little as 600 blocks appears - // to be able to saturate memory on V100, so this is an order of magnitude higher - // to try and future proof this a bit. 
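A worked example of that sizing logic, following the arithmetic above with illustrative numbers (the clamp to at least one block is omitted for brevity):

#include <cstdio>

int main()
{
  int const num_columns  = 10;
  int const num_rows     = 1 << 20;
  int const size_per_row = 40;  // padded row size in bytes

  int y_block_size = (num_columns + 3) / 4;          // 3 threads in y, capped at 32
  if (y_block_size > 32) { y_block_size = 32; }
  int const x_possible = 1024 / y_block_size;        // 341 threads available in x
  int const max_shared = 48 * 1024;                  // default shared memory budget
  int max_block_size   = max_shared / size_per_row;  // 1228 rows fit in shared memory
  if (max_block_size > x_possible) { max_block_size = x_possible; }
  int const block_size = (max_block_size / 32) * 32; // 320: keep x a warp multiple

  int num_blocks = (num_rows + block_size - 1) / block_size;
  if (num_blocks > 10240) { num_blocks = 10240; }    // cap to limit launch overhead

  std::printf("blocks=(%d,1,1) threads=(%d,%d,1) shmem=%d bytes\n",
              num_blocks, block_size, y_block_size, size_per_row * block_size);
  return 0;
}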
- num_blocks = 10240; - } - blocks.x = num_blocks; - blocks.y = 1; - blocks.z = 1; - threads.x = block_size; - threads.y = y_block_size; - threads.z = 1; - return size_per_row * block_size; -} - -/** - * When converting to rows it is possible that the size of the table was too big to fit - * in a single column. This creates an output column for a subset of the rows in a table - * going from start row and containing the next num_rows. Most of the parameters passed - * into this function are common between runs and should be calculated once. - */ -static std::unique_ptr fixed_width_convert_to_rows( - const cudf::size_type start_row, - const cudf::size_type num_rows, - const cudf::size_type num_columns, - const cudf::size_type size_per_row, - rmm::device_uvector& column_start, - rmm::device_uvector& column_size, - rmm::device_uvector& input_data, - rmm::device_uvector& input_nm, - const cudf::scalar& zero, - const cudf::scalar& scalar_size_per_row, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - int64_t total_allocation = size_per_row * num_rows; - // We made a mistake in the split somehow - CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); - - // Allocate and set the offsets row for the byte array - std::unique_ptr offsets = - cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream); - - std::unique_ptr data = - cudf::make_numeric_column(cudf::data_type(cudf::type_id::INT8), - static_cast(total_allocation), - cudf::mask_state::UNALLOCATED, - stream, - mr); - - dim3 blocks; - dim3 threads; - int shared_size = - detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - - copy_from_fixed_width_columns<<>>( - start_row, - num_rows, - num_columns, - size_per_row, - column_start.data(), - column_size.data(), - input_data.data(), - input_nm.data(), - data->mutable_view().data()); - - return cudf::make_lists_column(num_rows, - std::move(offsets), - std::move(data), - 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, - stream, - mr); -} - -static cudf::data_type get_data_type(const cudf::column_view& v) { return v.type(); } - -static inline bool are_all_fixed_width(std::vector const& schema) -{ - return std::all_of( - schema.begin(), schema.end(), [](const cudf::data_type& t) { return cudf::is_fixed_width(t); }); -} - -/** - * Given a set of fixed width columns, calculate how the data will be laid out in memory. - * @param [in] schema the types of columns that need to be laid out. - * @param [out] column_start the byte offset where each column starts in the row. - * @param [out] column_size the size in bytes of the data for each columns in the row. - * @return the size in bytes each row needs. - */ -static inline int32_t compute_fixed_width_layout(std::vector const& schema, - std::vector& column_start, - std::vector& column_size) -{ - // We guarantee that the start of each column is 64-bit aligned so anything can go - // there, but to make the code simple we will still do an alignment for it. 
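A worked example of that row layout for a small fixed-width schema, mirroring the computation that follows (the schema and sizes are illustrative):

#include <cstdint>
#include <cstdio>
#include <vector>

constexpr int32_t align_up(int32_t offset, int32_t alignment)
{
  return (offset + alignment - 1) & ~(alignment - 1);
}

int main()
{
  // example schema: INT8, INT32, INT16, INT64 (element sizes in bytes)
  std::vector<int32_t> const sizes{1, 4, 2, 8};
  std::vector<int32_t> starts;

  int32_t at = 0;
  for (auto s : sizes) {   // each column is aligned to its own element size
    at = align_up(at, s);
    starts.push_back(at);  // -> 0, 4, 8, 16
    at += s;
  }
  at += (static_cast<int32_t>(sizes.size()) + 7) / 8;  // 1 validity byte for 4 columns -> 25
  int32_t const row_size = align_up(at, 8);            // pad the row to 64 bits -> 32

  std::printf("row is %d bytes; column starts: %d %d %d %d\n",
              row_size, starts[0], starts[1], starts[2], starts[3]);
  return 0;
}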
- int32_t at_offset = 0; - for (auto col = schema.begin(); col < schema.end(); col++) { - cudf::size_type s = cudf::size_of(*col); - column_size.emplace_back(s); - std::size_t allocation_needed = s; - std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types - at_offset = align_offset(at_offset, alignment_needed); - column_start.emplace_back(at_offset); - at_offset += allocation_needed; - } - - // Now we need to add in space for validity - // Eventually we can think about nullable vs not nullable, but for now we will just always add - // it in - int32_t validity_bytes_needed = - (schema.size() + 7) / 8; // cudf::util::div_rounding_up_safe(schema.size(), 8); - // validity comes at the end and is byte aligned so we can pack more in. - at_offset += validity_bytes_needed; - // Now we need to pad the end so all rows are 64 bit aligned - return align_offset(at_offset, 8); // 8 bytes (64 bits) -} - -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - -template -static size_type compute_column_information(iterator begin, - iterator end, - std::vector& column_starts, - std::vector& column_sizes) //, -// std::function nested_type_cb) -{ - size_type fixed_width_size_per_row = 0; - for (auto cv = begin; cv != end; ++cv) { - auto col_type = std::get<0>(*cv); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - // if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } - - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 8 : size_of(col_type); - - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - } - - auto validity_offset = fixed_width_size_per_row; - column_starts.push_back(validity_offset); - - return fixed_width_size_per_row; -} - -//#define DEBUG - -std::vector build_validity_block_infos( - size_type const& num_columns, - size_type const& num_rows, - size_type const& shmem_limit_per_block, - std::vector const& row_batches) -{ - auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); - auto const column_stride = align_offset( - [&]() { - if (desired_rows_and_columns > num_columns) { - // not many columns, group it into 8s and ship it off - return std::min(8, num_columns); - } else { - return util::round_down_safe(desired_rows_and_columns, 8); - } - }(), - 8); - // we fit as much as we can given the column stride - // note that an element in the table takes just 1 bit, but a row with a single - // element still takes 8 bytes! 
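A sketch of how that constraint shapes the validity windows: fix the column stride first, in multiples of 8 so bits pack into whole bytes, then pack rows until the byte-padded rows fill the shared-memory budget. This is a simplified host-side rendering of the calculation above, with illustrative inputs:

#include <algorithm>
#include <cmath>
#include <cstdio>

constexpr int align_up(int offset, int alignment)
{
  return (offset + alignment - 1) & ~(alignment - 1);
}

int main()
{
  int const shmem_limit = 48 * 1024;
  int const num_columns = 300;
  int const num_rows    = 1 << 20;

  int const desired = static_cast<int>(std::sqrt(static_cast<double>(shmem_limit)));  // ~221
  int const column_stride = desired > num_columns ? std::min(8, num_columns)
                                                  : (desired / 8) * 8;       // 216 columns
  // a window row needs ceil(stride / 8) validity bytes, padded to 8
  int const bytes_per_row = align_up((column_stride + 7) / 8, 8);             // 32 bytes
  int const row_stride    = std::min(num_rows, shmem_limit / bytes_per_row);  // 1536 rows

  std::printf("validity window: %d cols x %d rows (%d bytes per row)\n",
              column_stride, row_stride, bytes_per_row);
  return 0;
}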
- auto const bytes_per_row = align_offset(util::div_rounding_up_unsafe(column_stride, 8), 8); - auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); - - std::vector validity_block_infos; - for (int col = 0; col < num_columns; col += column_stride) { - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int row = 0; - while (row < num_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(row_stride, rows_left_in_batch); - - validity_block_infos.emplace_back(detail::block_info{ - col, row, std::min(col + column_stride - 1, num_columns - 1), row + window_height - 1}); - row += window_height; - rows_left_in_batch -= window_height; - } - } - - return validity_block_infos; -} - -std::vector build_block_infos(std::vector const& column_sizes, - std::vector const& column_starts, - std::vector const& row_batches, - size_type const total_number_of_rows, - size_type const& shmem_limit_per_block) -{ - std::vector block_infos; - - // block infos are organized with the windows going "down" the columns - // this provides the most coalescing of memory access - int current_window_width = 0; - int current_window_start_col = 0; - - // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( - int const start_col, int const end_col, int const desired_window_height) { - int current_window_start_row = 0; - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; - while (i < total_number_of_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(desired_window_height, rows_left_in_batch); - - block_infos.emplace_back(detail::block_info{ - start_col, - current_window_start_row, - end_col, - std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), - current_window_row_batch}); - - i += window_height; - current_window_start_row += window_height; - rows_left_in_batch -= window_height; - } - }; - - // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write - // would be memory cache line sized access, but since other blocks will read/write the edges - // this may not turn out to be overly important. For now, we will attempt to build a square - // window as far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = - // shared_mem_size since we want them equal, so height and width are sqrt(shared_mem_size). The - // trick is that it's in bytes, not rows or columns. 
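A compact illustration of that square-window heuristic: take the square root of the shared-memory budget as a byte length per side, convert the vertical side into rows of an assumed row size, and round to a warp multiple. This is deliberately simplified; the real calculation below also folds in the batch row counts and the admin-data overhead.

#include <algorithm>
#include <cmath>
#include <cstdio>

int main()
{
  int const shmem_limit  = 48 * 1024;
  int const row_size_est = 8;  // assumed bytes per row for the columns in this window

  int const side_bytes = static_cast<int>(std::sqrt(static_cast<double>(shmem_limit)));  // ~221
  int window_height    = std::max(1, side_bytes / row_size_est);                         // ~27 rows
  window_height        = ((window_height + 31) / 32) * 32;                               // round up to 32

  std::printf("aim for windows about %d bytes wide and %d rows tall\n",
              side_bytes, window_height);
  return 0;
}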
- size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); - int const window_height = std::clamp( - util::round_up_safe( - std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], - total_number_of_rows), - 32), - 1, - row_batches[0].row_count); -#if defined(DEBUG) - printf( - "optimal_square_len is %d and we have %d columns, optimal_square_len / column_sizes[0] is %d " - "and num_rows is %d, batch row count is %d " - "- which makes window height " - "%d - admin size is %lu\n", - optimal_square_len, - (int)column_sizes.size(), - optimal_square_len / column_sizes[0], - total_number_of_rows, - row_batches[0].row_count, - window_height, - column_sizes.size() * sizeof(size_type) * 2); -#endif - - auto calc_admin_data_size = [](int num_cols) -> size_type { - // admin data is the column sizes and column start information. - // this is copied to shared memory as well and needs to be accounted for - // in the window calculation. - return num_cols * sizeof(size_type) + num_cols * sizeof(size_type); - }; - - int row_size = 0; - - // march each column and build the blocks of appropriate sizes - for (unsigned int col = 0; col < column_sizes.size(); ++col) { - auto const col_size = column_sizes[col]; - - // align size for this type - std::size_t alignment_needed = col_size; // They are the same for fixed width types - auto row_size_aligned = detail::align_offset(row_size, alignment_needed); - auto row_size_with_this_col = row_size_aligned + col_size; - auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - - if (row_size_with_end_pad * window_height + - calc_admin_data_size(col - current_window_start_col) > - shmem_limit_per_block) { -#if defined(DEBUG) - printf( - "row size with end pad is %d and admin data is %d, which adds up to %d and that is too " - "large for shmem block of %d\n", - row_size_with_end_pad, - calc_admin_data_size(col - current_window_start_col), - row_size_with_end_pad * window_height + - calc_admin_data_size(col - current_window_start_col), - shmem_limit_per_block); - printf( - "Window size %d too large at column %d, admin size is %d, bumping back to build windows " - "of " - "size %d(cols " - "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is " - "%d) " - "for shared mem size %d\n", - row_size_with_end_pad * window_height, - col, - calc_admin_data_size(col - current_window_start_col), - row_size * window_height, - current_window_start_col, - col - 1, - window_height, - row_size_with_end_pad, - row_size, - row_size_aligned, - shmem_limit_per_block); -#endif - // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col == 0 ? 
col : col - 1, window_height); - row_size = - detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); -#if defined(DEBUG) - printf( - "New window starting with offset %d and row size %d to be %d (previous column offset " - "%d+%d " - "or %d)\n", - row_size, - col_size, - row_size + col_size, - column_starts[col - 1], - column_sizes[col - 1], - column_starts[col - 1] + column_sizes[col - 1]); -#endif - row_size += col_size; // alignment required for shared memory window boundary to match + #include + #include + #include + #include + #include + + #include + #include + + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + #include + #endif + + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 8; + constexpr auto NUM_BLOCKS_PER_KERNEL_TO_ROWS = 2; + constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; + constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; + constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; + #endif + + using cudf::detail::make_device_uvector_async; + using rmm::device_uvector; + namespace cudf { + + namespace detail { + + static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size_t alignment) { + return (offset + alignment - 1) & ~(alignment - 1); + } + + __global__ void copy_from_rows_fixed_width_optimized( + const cudf::size_type num_rows, const cudf::size_type num_columns, + const cudf::size_type row_size, const cudf::size_type *input_offset_in_row, + const cudf::size_type *num_bytes, int8_t **output_data, cudf::bitmask_type **output_nm, + const int8_t *input_data) { + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // For simplicity we will refer to this as a row_group + + // In practice we have found writing more than 4 columns of data per thread + // results in performance loss. As such we are using a 2 dimensional + // kernel in terms of threads, but not in terms of blocks. Columns are + // controlled by the y dimension (there is no y dimension in blocks). Rows + // are controlled by the x dimension (there are multiple blocks in the x + // dimension). 
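A minimal sketch of that thread layout: the x dimension indexes rows within a row group, the y dimension strides across columns, and the grid walks row groups. Kernel and variable names are stand-ins, not the patch's launch code.

#include <cstdio>

// Thread (x, y) of a block owns row x of each row group it visits and
// columns y, y + blockDim.y, y + 2 * blockDim.y, ...
__global__ void show_mapping(int num_rows, int num_columns)
{
  int const rows_per_group = blockDim.x;
  for (int group = blockIdx.x; group * rows_per_group < num_rows; group += gridDim.x) {
    int const row = group * rows_per_group + threadIdx.x;
    if (row >= num_rows) { continue; }  // keep looping; later groups may still need this thread
    for (int col = threadIdx.y; col < num_columns; col += blockDim.y) {
      if (row == 0) {
        printf("thread (%d,%d) of block %d -> row %d col %d\n",
               threadIdx.x, threadIdx.y, blockIdx.x, row, col);
      }
    }
  }
}
// launched with something like dim3 threads(block_size, y_block_size) and a 1-D grid of row groups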
+ + cudf::size_type rows_per_group = blockDim.x; + cudf::size_type row_group_start = blockIdx.x; + cudf::size_type row_group_stride = gridDim.x; + cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + + extern __shared__ int8_t shared_data[]; + + // Because we are copying fixed width only data and we stride the rows + // this thread will always start copying from shared data in the same place + int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t *row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + + for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; + row_group_index += row_group_stride) { + // Step 1: Copy the data into shared memory + // We know row_size is always aligned with and a multiple of int64_t; + int64_t *long_shared = reinterpret_cast(shared_data); + const int64_t *long_input = reinterpret_cast(input_data); + + cudf::size_type shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); + cudf::size_type shared_output_stride = blockDim.x * blockDim.y; + cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); + if (row_index_end > num_rows) { + row_index_end = num_rows; + } + cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); + cudf::size_type shared_length = row_size * num_rows_in_group; + + cudf::size_type shared_output_end = shared_length / sizeof(int64_t); + + cudf::size_type start_input_index = + (row_size * row_group_index * rows_per_group) / sizeof(int64_t); + + for (cudf::size_type shared_index = shared_output_index; shared_index < shared_output_end; + shared_index += shared_output_stride) { + long_shared[shared_index] = long_input[start_input_index + shared_index]; + } + // Wait for all of the data to be in shared memory + __syncthreads(); + + // Step 2 copy the data back out + + // Within the row group there should be 1 thread for each row. This is a + // requirement for launching the kernel + cudf::size_type row_index = (row_group_index * rows_per_group) + threadIdx.x; + // But we might not use all of the threads if the number of rows does not go + // evenly into the thread count. We don't want those threads to exit yet + // because we may need them to copy data in for the next row group. + uint32_t active_mask = __ballot_sync(0xffffffff, row_index < num_rows); + if (row_index < num_rows) { + cudf::size_type col_index_start = threadIdx.y; + cudf::size_type col_index_stride = blockDim.y; + for (cudf::size_type col_index = col_index_start; col_index < num_columns; + col_index += col_index_stride) { + cudf::size_type col_size = num_bytes[col_index]; + const int8_t *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); + int8_t *col_output = output_data[col_index]; + switch (col_size) { + case 1: { + col_output[row_index] = *col_tmp; + break; + } + case 2: { + int16_t *short_col_output = reinterpret_cast(col_output); + short_col_output[row_index] = *reinterpret_cast(col_tmp); + break; + } + case 4: { + int32_t *int_col_output = reinterpret_cast(col_output); + int_col_output[row_index] = *reinterpret_cast(col_tmp); + break; + } + case 8: { + int64_t *long_col_output = reinterpret_cast(col_output); + long_col_output[row_index] = *reinterpret_cast(col_tmp); + break; + } + default: { + cudf::size_type output_offset = col_size * row_index; + // TODO this should just not be supported for fixed width columns, but just in case... 
+ for (cudf::size_type b = 0; b < col_size; b++) { + col_output[b + output_offset] = col_tmp[b]; + } + break; + } + } + + cudf::bitmask_type *nm = output_nm[col_index]; + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; + int predicate = *valid_byte & (1 << byte_bit_offset); + uint32_t bitmask = __ballot_sync(active_mask, predicate); + if (row_index % 32 == 0) { + nm[word_index(row_index)] = bitmask; + } + } // end column loop + } // end row copy + // wait for the row_group to be totally copied before starting on the next row group + __syncthreads(); + } + } + + __global__ void copy_to_rows_fixed_width_optimized( + const cudf::size_type start_row, const cudf::size_type num_rows, + const cudf::size_type num_columns, const cudf::size_type row_size, + const cudf::size_type *output_offset_in_row, const cudf::size_type *num_bytes, + const int8_t **input_data, const cudf::bitmask_type **input_nm, int8_t *output_data) { + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // We do not support copying a subset of the columns in a row yet, so we don't + // currently support a row that is wider than shared memory. + // For simplicity we will refer to this as a row_group + + // In practice we have found reading more than 4 columns of data per thread + // results in performance loss. As such we are using a 2 dimensional + // kernel in terms of threads, but not in terms of blocks. Columns are + // controlled by the y dimension (there is no y dimension in blocks). Rows + // are controlled by the x dimension (there are multiple blocks in the x + // dimension). + + cudf::size_type rows_per_group = blockDim.x; + cudf::size_type row_group_start = blockIdx.x; + cudf::size_type row_group_stride = gridDim.x; + cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + + extern __shared__ int8_t shared_data[]; + + // Because we are copying fixed width only data and we stride the rows + // this thread will always start copying to shared data in the same place + int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t *row_vld_tmp = + &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + + for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; + row_group_index += row_group_stride) { + // Within the row group there should be 1 thread for each row. This is a + // requirement for launching the kernel + cudf::size_type row_index = start_row + (row_group_index * rows_per_group) + threadIdx.x; + // But we might not use all of the threads if the number of rows does not go + // evenly into the thread count. We don't want those threads to exit yet + // because we may need them to copy data back out. 
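That is why the bounds check wraps the per-row work instead of returning early: the shared-memory staging and __syncthreads() are collective and need every thread of the block to participate. A generic sketch of the guard pattern (not the kernel itself):

__global__ void guarded_collective(const int* in, int* out, int n)
{
  extern __shared__ int stage[];
  int const idx = blockIdx.x * blockDim.x + threadIdx.x;

  // only in-range threads do per-element work...
  if (idx < n) { stage[threadIdx.x] = in[idx] * 2; }

  // ...but every thread must reach the barrier, so no early return above
  __syncthreads();

  if (idx < n) { out[idx] = stage[threadIdx.x]; }
}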
+ if (row_index < (start_row + num_rows)) { + cudf::size_type col_index_start = threadIdx.y; + cudf::size_type col_index_stride = blockDim.y; + for (cudf::size_type col_index = col_index_start; col_index < num_columns; + col_index += col_index_stride) { + cudf::size_type col_size = num_bytes[col_index]; + int8_t *col_tmp = &(row_tmp[output_offset_in_row[col_index]]); + const int8_t *col_input = input_data[col_index]; + switch (col_size) { + case 1: { + *col_tmp = col_input[row_index]; + break; + } + case 2: { + const int16_t *short_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = short_col_input[row_index]; + break; + } + case 4: { + const int32_t *int_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = int_col_input[row_index]; + break; + } + case 8: { + const int64_t *long_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = long_col_input[row_index]; + break; + } + default: { + cudf::size_type input_offset = col_size * row_index; + // TODO this should just not be supported for fixed width columns, but just in case... + for (cudf::size_type b = 0; b < col_size; b++) { + col_tmp[b] = col_input[b + input_offset]; + } + break; + } + } + // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned + // so we have to rewrite the addresses to make sure that it is 4 byte aligned + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; + uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; + int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); + cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); + // Now copy validity for the column + if (input_nm[col_index]) { + if (bit_is_set(input_nm[col_index], row_index)) { + atomicOr_block(valid_int, 1 << int_bit_offset); + } else { + atomicAnd_block(valid_int, ~(1 << int_bit_offset)); + } + } else { + // It is valid so just set the bit + atomicOr_block(valid_int, 1 << int_bit_offset); + } + } // end column loop + } // end row copy + // wait for the row_group to be totally copied into shared memory + __syncthreads(); + + // Step 2: Copy the data back out + // We know row_size is always aligned with and a multiple of int64_t; + int64_t *long_shared = reinterpret_cast(shared_data); + int64_t *long_output = reinterpret_cast(output_data); + + cudf::size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); + cudf::size_type shared_input_stride = blockDim.x * blockDim.y; + cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); + if (row_index_end > num_rows) { + row_index_end = num_rows; + } + cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); + cudf::size_type shared_length = row_size * num_rows_in_group; + + cudf::size_type shared_input_end = shared_length / sizeof(int64_t); + + cudf::size_type start_output_index = + (row_size * row_group_index * rows_per_group) / sizeof(int64_t); + + for (cudf::size_type shared_index = shared_input_index; shared_index < shared_input_end; + shared_index += shared_input_stride) { + long_output[start_output_index + shared_index] = long_shared[shared_index]; + } + __syncthreads(); + // Go for the next round + } + } + + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + + struct block_info { + int start_col; + int start_row; + int end_col; + int end_row; + int buffer_num; + + __host__ __device__ size_type get_shared_row_size(size_type const *const col_offsets, + size_type const *const col_sizes) 
const {
+    return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8);
+  }
+  __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; }
+
+  __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; }
+};
+
+// When building the columns to return, we have to be mindful of the offset limit in cudf.
+// It is 32-bit and these data columns are capable of surpassing that easily. The data should
+// not be cut off exactly at the limit though due to the validity buffers. The most efficient
+// place to cut the validity is on a 32-row boundary, so as we calculate the row sizes
+// we keep track of the cut points for the validity, which we call row batches. If a batch
+// grows larger than can be represented with the 32-bit offsets, we cut it at the last 32-row
+// boundary we hit. Note that this boundary is for our book-keeping with column pointers and
+// not anything that the kernel needs to worry about. We cut the output at convenient
+// boundaries when assembling the outgoing data stream.
+struct row_batch {
+  size_type num_bytes;
+  size_type row_count;
+};
+
+/**
+ * @brief copy data from cudf columns into the row-based format
+ *
+ * @param num_rows total number of rows in the table
+ * @param num_columns total number of columns in the table
+ * @param shmem_used_per_block amount of shared memory that is used by a block
+ * @param num_block_infos number of infos in blocks array
+ * @param input_data pointer to raw table data
+ * @param col_sizes array of sizes for each element in a column - one per column
+ * @param col_offsets offset into input data row for each column's start
+ * @param block_infos information about the blocks of work
+ * @param row_offsets offset to a specific row in the output data
+ * @param output_data pointers to the output data, one per row batch
+ *
+ */
+__global__ void copy_to_rows(const size_type num_rows, const size_type num_columns,
+                             const size_type shmem_used_per_block, const size_type num_block_infos,
+                             const int8_t **input_data, const size_type *col_sizes,
+                             const size_type *col_offsets, const block_info *block_infos,
+                             const size_type *row_offsets, int8_t **output_data) {
+  // We are going to copy the data in two passes.
+  // The first pass copies a chunk of data into shared memory.
+  // The second pass copies that chunk from shared memory out to the final location.
+
+  // Because shared memory is limited we copy a subset of the rows at a time.
+  // This has been broken up for us in the block_info struct, so we don't have
+  // any calculation to do here, but it is important to note.
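+  // As a rough illustration (hypothetical block, not produced by this patch): a block_info of
+  // {start_col = 0, start_row = 0, end_col = 2, end_row = 255} over columns laid out as INT32 at
+  // offset 0, INT64 at offset 8 and INT8 at offset 16 has
+  //   get_shared_row_size = align_offset(16 + 1 - 0, 8) = 24 bytes per row,
+  // so the 256 rows of that block occupy 256 * 24 = 6144 bytes of shared memory.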
+ + constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; + auto group = cooperative_groups::this_thread_block(); + extern __shared__ int8_t shared_data[]; + int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; + + __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&block_barrier[i], group.size()); + } + } + + group.sync(); + + auto const blocks_remaining = + std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS, + (uint)NUM_BLOCKS_PER_KERNEL_TO_ROWS); + + size_t fetch; + size_t subset; + for (subset = fetch = 0; subset < blocks_remaining; ++subset) { + // Fetch ahead up to stages_count subsets + for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { + auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + fetch]; + auto const num_fetch_cols = fetch_block.num_cols(); + auto const num_fetch_rows = fetch_block.num_rows(); + auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; + auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); + auto const starting_column_offset = col_offsets[fetch_block.start_col]; + auto &fetch_barrier = block_barrier[fetch % NUM_BLOCKS_PER_KERNEL_LOADED]; + + // wait for the last use of the memory to be completed + if (fetch > NUM_BLOCKS_PER_KERNEL_LOADED) { + fetch_barrier.arrive_and_wait(); + } + + // to do the copy we need to do n column copies followed by m element copies OR + // we have to do m element copies followed by r row copies. When going from column + // to row it is much easier to copy by elements first otherwise we would need a running + // total of the column sizes for our block, which isn't readily available. This makes it + // more appealing to copy element-wise from input data into shared matching the end layout + // and do row-based memcopies out. 
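+      // For instance (hypothetical numbers): with a shared row size of 24 bytes and a starting
+      // column offset of 0, the element at relative_row = 2 whose column starts 8 bytes into the
+      // row lands at shared offset 2 * 24 + 8 = 56, i.e. the same offset the column occupies in
+      // the output row, measured from this block's starting column offset.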
+ + auto const shared_buffer_base = shared[fetch % stages_count]; + for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { + auto const relative_col = el / num_fetch_rows; + auto const relative_row = el % num_fetch_rows; + auto const absolute_col = relative_col + fetch_block.start_col; + auto const absolute_row = relative_row + fetch_block.start_row; + auto const col_size = col_sizes[absolute_col]; + auto const col_offset = col_offsets[absolute_col]; + auto const relative_col_offset = col_offset - starting_column_offset; + + auto const shared_offset = relative_row * fetch_block_row_size + relative_col_offset; + auto const input_src = input_data[absolute_col] + col_size * absolute_row; + + // copy the element from global memory + switch (col_size) { + case 2: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, + cuda::aligned_size_t<2>(col_size), fetch_barrier); + break; + case 4: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, + cuda::aligned_size_t<4>(col_size), fetch_barrier); + break; + case 8: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, + cuda::aligned_size_t<8>(col_size), fetch_barrier); + break; + default: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, col_size, + fetch_barrier); + break; + } + } + } + + auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; + subset_barrier.arrive_and_wait(); + + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + subset]; + auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); + auto const column_offset = col_offsets[block.start_col]; + auto const block_output_buffer = output_data[block.buffer_num]; + + // copy entire rows to final dest + for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; + absolute_row += blockDim.x) { + auto const relative_row = absolute_row - block.start_row; + auto const output_dest = block_output_buffer + row_offsets[absolute_row] + column_offset; + auto const shared_offset = block_row_size * relative_row; + + cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], + cuda::aligned_size_t<8>(block_row_size), subset_barrier); + } + } + + // wait on the last copies to complete + for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { + block_barrier[i].arrive_and_wait(); + } + } + + /** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block + * @param offsets + * @param output_data pointer to output data, partitioned by data size + * @param validity_offsets offset into input data row for validity data + * @param block_infos information about the blocks of work + * @param num_block_infos number of infos in blocks array + * @param input_data pointer to input data + * + */ + __global__ void copy_validity_to_rows( + const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, + const size_type *row_offsets, int8_t **output_data, const size_type validity_offset, + const block_info *block_infos, const size_type num_block_infos, const bitmask_type **input_nm) { + extern __shared__ int8_t shared_data[]; + int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { + shared_data, shared_data + shmem_used_per_block / 2}; + + // per conversation with DaveB + // each 
thread of warp reads a single int32 of validity - so we read 128 bytes + // then ballot_sync the bits and write the result to shmem + // after we fill shared mem memcpy it out in a blob. + // probably need knobs for number of rows vs columns to balance read/write + auto group = cooperative_groups::this_thread_block(); + + int const blocks_remaining = + std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, + (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); + + __shared__ cuda::barrier + shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&shared_block_barriers[i], group.size()); + } + } + + group.sync(); + + for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { + if (validity_block != validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] + .arrive_and_wait(); + } + int8_t *this_shared_block = shared_blocks[validity_block % 2]; + auto block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; + + auto const num_block_cols = block.num_cols(); + auto const num_block_rows = block.num_rows(); + + auto const num_sections_x = util::div_rounding_up_unsafe(num_block_cols, 32); + auto const num_sections_y = util::div_rounding_up_unsafe(num_block_rows, 32); + auto const validity_data_row_length = + align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); + auto const total_sections = num_sections_x * num_sections_y; + + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + + // the block is divided into sections. A warp operates on a section at a time. + for (int my_section_idx = warp_id; my_section_idx < total_sections; + my_section_idx += warps_per_block) { + // convert to rows and cols + auto const section_x = my_section_idx % num_sections_x; + auto const section_y = my_section_idx / num_sections_x; + auto const relative_col = section_x * 32 + lane_id; + auto const relative_row = section_y * 32; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + auto const cols_left = num_columns - absolute_col; + auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); + + if (absolute_col < num_columns) { + auto my_data = input_nm[absolute_col] != nullptr ? + input_nm[absolute_col][absolute_row / 32] : + std::numeric_limits::max(); + + // every thread that is participating in the warp has a byte, but it's column-based + // data and we need it in row-based. So we shuffle the bits around with ballot_sync to + // make the bytes we actually write. 
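+          // For example: on iteration i of the loop below, lane k of the warp tests whether row
+          // (relative_row + i) is valid in its column, and __ballot_sync packs lane k's answer
+          // into bit k of the result. That gives one row-major validity word covering the (up
+          // to) 32 columns this warp owns, e.g. 0b1001 if only lanes 0 and 3 saw a valid value.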
+        bitmask_type dw_mask = 1;
+        for (int i = 0; i < 32 && relative_row + i < num_rows; ++i, dw_mask <<= 1) {
+          auto validity_data = __ballot_sync(participation_mask, my_data & dw_mask);
+          // lead thread in each warp writes data
+          auto const validity_write_offset =
+            validity_data_row_length * (relative_row + i) + relative_col / 8;
+          if (threadIdx.x % detail::warp_size == 0) {
+            if (cols_left <= 8) {
+              // write byte
+              this_shared_block[validity_write_offset] = validity_data & 0xFF;
+            } else if (cols_left <= 16) {
+              // write int16
+              *reinterpret_cast<int16_t *>(&this_shared_block[validity_write_offset]) =
+                validity_data & 0xFFFF;
+            } else if (cols_left <= 24) {
+              // write int16 and then int8
+              *reinterpret_cast<int16_t *>(&this_shared_block[validity_write_offset]) =
+                validity_data & 0xFFFF;
+              this_shared_block[validity_write_offset + 2] = (validity_data >> 16) & 0xFF;
+            } else {
+              // write int32
+              *reinterpret_cast<int32_t *>(&this_shared_block[validity_write_offset]) =
+                validity_data;
+            }
+          }
+        }
+      }
+    }
+
+    // make sure entire block has finished copy
+    group.sync();
+
+    auto const output_data_base =
+      output_data[block.buffer_num] + validity_offset + block.start_col / 8;
+
+    // now async memcpy the shared memory out to the final destination
+    for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) {
+      auto const relative_row = row - block.start_row;
+      auto const output_ptr = output_data_base + row_offsets[row];
+      auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8);
+
+      cuda::memcpy_async(
+        output_ptr, &this_shared_block[validity_data_row_length * relative_row], num_bytes,
+        shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]);
+    }
+  }
+
+  // wait for last blocks of data to arrive
+  auto const num_blocks_to_wait = blocks_remaining > NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED ?
+                                    NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED :
+                                    blocks_remaining;
+  for (int validity_block = 0; validity_block < num_blocks_to_wait; ++validity_block) {
+    shared_block_barriers[validity_block].arrive_and_wait();
+  }
+}
+
+static __device__ std::tuple<size_t, size_t>
+get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num_cols) {
+  auto const col_size_bytes = num_cols * col_size_size;
+  auto const col_offset_bytes = num_cols * col_offset_size;
+
+  return {col_size_bytes, col_offset_bytes};
+}
+
+/**
+ * @brief copy data from row-based format to cudf columns
+ *
+ * @param num_rows total number of rows in the table
+ * @param num_columns total number of columns in the table
+ * @param shmem_used_per_block amount of shared memory that is used by a block
+ * @param row_offsets offset to a specific row in the input data
+ * @param output_data pointers to the output column data
+ * @param _col_sizes array of sizes for each element in a column - one per column
+ * @param _col_offsets offset into input data row for each column's start
+ * @param block_infos information about the blocks of work
+ * @param num_block_infos number of infos in blocks array
+ * @param input_data pointer to input data
+ *
+ */
+__global__ void copy_from_rows(const size_type num_rows, const size_type num_columns,
+                               const size_type shmem_used_per_block, const size_type *row_offsets,
+                               int8_t **output_data, const size_type *_col_sizes,
+                               const size_type *_col_offsets, const block_info *block_infos,
+                               const size_type num_block_infos, const int8_t *input_data) {
+  // We are going to copy the data in two passes.
+  // The first pass copies a chunk of data into shared memory.
+  // The second pass copies that chunk from shared memory out to the final location.
+
+  // Because shared memory is limited we copy a subset of the rows at a time.
+ // This has been broken up for us in the block_info struct, so we don't have + // any calculation to do here, but it is important to note. + + // to speed up some of the random access memory we do, we copy col_sizes and col_offsets + // to shared memory for each of the blocks that we work on + + constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; + auto group = cooperative_groups::this_thread_block(); + extern __shared__ int8_t shared_data[]; + int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; + + __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&block_barrier[i], group.size()); + } + } + + group.sync(); + + auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS, + (uint)NUM_BLOCKS_PER_KERNEL_FROM_ROWS); + + size_t fetch_index; + size_t processing_index; + for (processing_index = fetch_index = 0; processing_index < blocks_remaining; + ++processing_index) { + // Fetch ahead up to stages_count groups + for (; fetch_index < static_cast(blocks_remaining) && + fetch_index < (processing_index + stages_count); + ++fetch_index) { + auto const fetch_block = + block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + fetch_index]; + auto const fetch_block_start_row = fetch_block.start_row; + auto const fetch_block_end_row = fetch_block.end_row; + auto const starting_col_offset = _col_offsets[fetch_block.start_col]; + auto const fetch_block_row_size = fetch_block.get_shared_row_size(_col_offsets, _col_sizes); + auto const num_fetch_cols = fetch_block.num_cols(); + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( + sizeof(decltype(*_col_sizes)), sizeof(decltype(*_col_offsets)), num_fetch_cols); + auto &fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; + + // if we have fetched all buffers, we need to wait for processing + // to complete on them before we can use them again + if (fetch_index > NUM_BLOCKS_PER_KERNEL_LOADED) { + fetch_barrier.arrive_and_wait(); + } + + auto shared_row_offset = 0; + // copy the data for column sizes + cuda::memcpy_async(group, &shared[fetch_index % stages_count][shared_row_offset], + &_col_sizes[fetch_block.start_col], col_size_bytes, fetch_barrier); + shared_row_offset += col_size_bytes; + // copy the data for column offsets + cuda::memcpy_async(group, &shared[fetch_index % stages_count][shared_row_offset], + &_col_offsets[fetch_block.start_col], col_offset_bytes, fetch_barrier); + shared_row_offset += col_offset_bytes; + shared_row_offset = align_offset(shared_row_offset, 8); + + for (auto row = fetch_block_start_row + static_cast(threadIdx.x); + row <= fetch_block_end_row; row += blockDim.x) { + auto shared_offset = + (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; + // copy the main + cuda::memcpy_async(&shared[fetch_index % stages_count][shared_offset], + &input_data[row_offsets[row] + starting_col_offset], + fetch_block_row_size, fetch_barrier); + } + } + + auto &processing_barrier = block_barrier[processing_index % NUM_BLOCKS_PER_KERNEL_LOADED]; + + // ensure our data is ready + processing_barrier.arrive_and_wait(); + + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + processing_index]; + auto const rows_in_block = block.num_rows(); + auto const cols_in_block = block.num_cols(); + + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( + 
sizeof(decltype(*_col_sizes)), sizeof(decltype(*_col_offsets)), cols_in_block); + auto shared_col_sizes = reinterpret_cast(shared[processing_index % stages_count]); + auto shared_col_offsets = + reinterpret_cast(&shared[processing_index % stages_count][col_size_bytes]); + + auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); + + auto block_row_size = block.get_shared_row_size(_col_offsets, _col_sizes); + + // now we copy from shared memory to final destination. + // the data is laid out in rows in shared memory, so the reads + // for a column will be "vertical". Because of this and the different + // sizes for each column, this portion is handled on row/column basis. + // to prevent each thread working on a single row and also to ensure + // that all threads can do work in the case of more threads than rows, + // we do a global index instead of a double for loop with col/row. + for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { + auto const relative_col = index % cols_in_block; + auto const relative_row = index / cols_in_block; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + + auto const shared_memory_row_offset = block_row_size * relative_row; + auto const shared_memory_offset = shared_col_offsets[relative_col] - shared_col_offsets[0] + + shared_memory_row_offset + shared_row_offset; + auto const column_size = shared_col_sizes[relative_col]; + + int8_t *shmem_src = &shared[processing_index % stages_count][shared_memory_offset]; + int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; + + cuda::memcpy_async(dst, shmem_src, column_size, processing_barrier); + } + group.sync(); + } + + // wait on the last copies to complete + for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { + block_barrier[i].arrive_and_wait(); + } + } + + /** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block + * @param offsets + * @param output_nm + * @param validity_offsets offset into input data row for validity data + * @param block_infos information about the blocks of work + * @param num_block_infos number of infos in blocks array + * @param input_data pointer to input data + * + */ + __global__ void copy_validity_from_rows( + const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, + const size_type *row_offsets, cudf::bitmask_type **output_nm, const size_type validity_offset, + const block_info *block_infos, const size_type num_block_infos, const int8_t *input_data) { + extern __shared__ int8_t shared_data[]; + int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { + shared_data, shared_data + shmem_used_per_block / 2}; + + // per conversation with DaveB + // each thread of warp reads a single byte of validity - so we read 32 bytes + // then ballot_sync the bits and write the result to shmem + // after we fill shared mem memcpy it out in a blob. 
+  // probably need knobs for number of rows vs columns to balance read/write
+  auto group = cooperative_groups::this_thread_block();
+
+  int const blocks_remaining =
+    std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL,
+             (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL);
+
+  __shared__ cuda::barrier<cuda::thread_scope_block>
+    shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED];
+  if (group.thread_rank() == 0) {
+    for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) {
+      init(&shared_block_barriers[i], group.size());
+    }
+  }
+
+  group.sync();
+
+  for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) {
+    auto const validity_index = validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED;
+    if (validity_block != validity_index) {
+      shared_block_barriers[validity_index].arrive_and_wait();
+    }
+    int8_t *this_shared_block = shared_blocks[validity_block % 2];
+    auto const block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block];
+    auto const block_start_col = block.start_col;
+    auto const block_start_row = block.start_row;
+    auto const num_block_cols = block.num_cols();
+    auto const num_block_rows = block.num_rows();
+    auto const num_sections_x = (num_block_cols + 7) / 8;
+    auto const num_sections_y = (num_block_rows + 31) / 32;
+    auto const validity_data_col_length = num_sections_y * 4;  // words to bytes
+    auto const total_sections = num_sections_x * num_sections_y;
+    int const warp_id = threadIdx.x / detail::warp_size;
+    int const lane_id = threadIdx.x % detail::warp_size;
+    auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size);
+
+    // the block is divided into sections. A warp operates on a section at a time.
+    for (int my_section_idx = warp_id; my_section_idx < total_sections;
+         my_section_idx += warps_per_block) {
+      // convert to rows and cols
+      auto const section_x = my_section_idx % num_sections_x;
+      auto const section_y = my_section_idx / num_sections_x;
+      auto const relative_col = section_x * 8;
+      auto const relative_row = section_y * 32 + lane_id;
+      auto const absolute_col = relative_col + block_start_col;
+      auto const absolute_row = relative_row + block_start_row;
+      auto const rows_left = num_rows - absolute_row;
+
+      auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_row < num_rows);
+
+      if (absolute_row < num_rows) {
+        auto const my_byte =
+          input_data[row_offsets[absolute_row] + validity_offset + absolute_col / 8];
+
+        // every thread that is participating in the warp has a byte, but it's row-based
+        // data and we need it in column-based. So we shuffle the bits around to make
+        // the bytes we actually write.
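+        // For example: on iteration i of the loop below, lane k of the warp tests whether
+        // column (relative_col + i) is valid in its row, and __ballot_sync packs lane k's
+        // answer into bit k of the result, giving one column-major validity word covering the
+        // (up to) 32 rows this warp owns, e.g. 0b0101 if only the first and third rows are valid.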
+        for (int i = 0, byte_mask = 1; i < 8 && relative_col + i < num_columns;
+             ++i, byte_mask <<= 1) {
+          auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask);
+          // lead thread in each warp writes data
+          if (threadIdx.x % detail::warp_size == 0) {
+            auto const validity_write_offset =
+              validity_data_col_length * (relative_col + i) + relative_row / 8;
+
+            if (rows_left <= 8) {
+              // write byte
+              this_shared_block[validity_write_offset] = validity_data & 0xFF;
+            } else if (rows_left <= 16) {
+              // write int16
+              *reinterpret_cast<int16_t *>(&this_shared_block[validity_write_offset]) =
+                validity_data & 0xFFFF;
+            } else if (rows_left <= 24) {
+              // write int16 and then int8
+              *reinterpret_cast<int16_t *>(&this_shared_block[validity_write_offset]) =
+                validity_data & 0xFFFF;
+              this_shared_block[validity_write_offset + 2] = (validity_data >> 16) & 0xFF;
+            } else {
+              // write int32
+              *reinterpret_cast<int32_t *>(&this_shared_block[validity_write_offset]) =
+                validity_data;
+            }
+          }
+        }
+      }
+    }
+
+    // make sure entire block has finished copy
+    group.sync();
+
+    // now async memcpy the shared memory out to the final destination
+    for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) {
+      auto const relative_col = col - block.start_col;
+
+      cuda::memcpy_async(
+        output_nm[col] + word_index(block_start_row),
+        &this_shared_block[validity_data_col_length * relative_col],
+        util::div_rounding_up_unsafe(num_block_rows, 8),
+        shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]);
+    }
+  }
+
+  // wait for last blocks of data to arrive
+  auto const num_blocks_to_wait = blocks_remaining > NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED ?
+                                    NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED :
+                                    blocks_remaining;
+  for (int validity_block = 0; validity_block < num_blocks_to_wait; ++validity_block) {
+    shared_block_barriers[validity_block].arrive_and_wait();
+  }
+}
+
+#endif  // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+
+/**
+ * Calculate the dimensions of the kernel for fixed width only columns.
+ * @param [in] num_columns the number of columns being copied.
+ * @param [in] num_rows the number of rows being copied.
+ * @param [in] size_per_row the size each row takes up when padded.
+ * @param [out] blocks the size of the blocks for the kernel
+ * @param [out] threads the size of the threads for the kernel
+ * @return the size in bytes of shared memory needed for each block.
+ */
+static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns,
+                                        const cudf::size_type num_rows,
+                                        const cudf::size_type size_per_row, dim3 &blocks,
+                                        dim3 &threads) {
+  // We have found speed degrades when a thread handles more than 4 columns.
+  // Each block is 2 dimensional. The y dimension indicates the columns.
+  // We limit this to 32 threads in the y dimension so we can still
+  // have at least 32 threads in the x dimension (1 warp) which should
+  // result in better coalescing of memory operations. We also
+  // want to guarantee that we are processing a multiple of 32 threads
+  // in the x dimension because we use atomic operations at the block
+  // level when writing validity data out to main memory, and that would
+  // need to change if we split a word of validity data between blocks.
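+  // As a rough worked example (hypothetical sizes, not fixed by this patch): with
+  // num_columns = 10 and size_per_row = 24 bytes,
+  //   y_block_size          = (10 + 3) / 4   = 3
+  //   x_possible_block_size = 1024 / 3       = 341
+  //   max_block_size        = 48 * 1024 / 24 = 2048, capped to 341
+  //   block_size            = (341 / 32) * 32 = 320 threads in x
+  // so the kernel runs 320x3 threads per block and needs 320 * 24 = 7680 bytes of shared memory.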
+  int y_block_size = (num_columns + 3) / 4;  // cudf::util::div_rounding_up_safe(num_columns, 4);
+  if (y_block_size > 32) {
+    y_block_size = 32;
+  }
+  int x_possible_block_size = 1024 / y_block_size;
+  // 48KB is the default setting for shared memory per block according to the CUDA tutorials.
+  // If someone configures the GPU to only have 16 KB this might not work.
+  int max_shared_size = 48 * 1024;
+  int max_block_size = max_shared_size / size_per_row;
+  // If we don't have enough shared memory there is no point in having more threads
+  // per block that would just sit idle
+  max_block_size = max_block_size > x_possible_block_size ? x_possible_block_size : max_block_size;
+  // Make sure that the x dimension is a multiple of 32. This not only helps
+  // coalesce memory access, it also lets us do a ballot sync for validity to write
+  // the data back out at the warp level. If x is a multiple of 32 then each thread in the y
+  // dimension is associated with one or more warps, which should correspond to the validity
+  // words directly.
+  int block_size = (max_block_size / 32) * 32;
+  CUDF_EXPECTS(block_size != 0, "Row size is too large to fit in shared memory");
+
+  int num_blocks = (num_rows + block_size - 1) / block_size;
+  if (num_blocks < 1) {
+    num_blocks = 1;
+  } else if (num_blocks > 10240) {
+    // The maximum number of blocks supported in the x dimension is 2 ^ 31 - 1,
+    // but in practice having too many can cause some overhead that is not fully
+    // understood. Experiments show that having as little as 600 blocks is enough
+    // to saturate memory on a V100, so this limit is an order of magnitude higher
+    // to try and future proof this a bit.
+    num_blocks = 10240;
+  }
+  blocks.x = num_blocks;
+  blocks.y = 1;
+  blocks.z = 1;
+  threads.x = block_size;
+  threads.y = y_block_size;
+  threads.z = 1;
+  return size_per_row * block_size;
+}
+
+/**
+ * When converting to rows it is possible that the size of the table is too big to fit
+ * in a single column. This creates an output column for a subset of the rows in the table,
+ * starting at start_row and containing the next num_rows rows. Most of the parameters passed
+ * into this function are common between runs and should be calculated once.
+ */ + static std::unique_ptr + fixed_width_convert_to_rows(const cudf::size_type start_row, const cudf::size_type num_rows, + const cudf::size_type num_columns, const cudf::size_type size_per_row, + rmm::device_uvector &column_start, + rmm::device_uvector &column_size, + rmm::device_uvector &input_data, + rmm::device_uvector &input_nm, + const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { + int64_t total_allocation = size_per_row * num_rows; + // We made a mistake in the split somehow + CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); + + // Allocate and set the offsets row for the byte array + std::unique_ptr offsets = + cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream); + + std::unique_ptr data = cudf::make_numeric_column( + cudf::data_type(cudf::type_id::INT8), static_cast(total_allocation), + cudf::mask_state::UNALLOCATED, stream, mr); + + dim3 blocks; + dim3 threads; + int shared_size = + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + + copy_to_rows_fixed_width_optimized<<>>( + start_row, num_rows, num_columns, size_per_row, column_start.data(), column_size.data(), + input_data.data(), input_nm.data(), data->mutable_view().data()); + + return cudf::make_lists_column(num_rows, std::move(offsets), std::move(data), 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr); + } + + static cudf::data_type get_data_type(const cudf::column_view &v) { + return v.type(); + } + + static inline bool are_all_fixed_width(std::vector const &schema) { + return std::all_of(schema.begin(), schema.end(), + [](const cudf::data_type &t) { return cudf::is_fixed_width(t); }); + } + + /** + * Given a set of fixed width columns, calculate how the data will be laid out in memory. + * @param [in] schema the types of columns that need to be laid out. + * @param [out] column_start the byte offset where each column starts in the row. + * @param [out] column_size the size in bytes of the data for each columns in the row. + * @return the size in bytes each row needs. + */ + static inline int32_t compute_fixed_width_layout(std::vector const &schema, + std::vector &column_start, + std::vector &column_size) { + // We guarantee that the start of each column is 64-bit aligned so anything can go + // there, but to make the code simple we will still do an alignment for it. + int32_t at_offset = 0; + for (auto col = schema.begin(); col < schema.end(); col++) { + cudf::size_type s = cudf::size_of(*col); + column_size.emplace_back(s); + std::size_t allocation_needed = s; + std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types + at_offset = align_offset(at_offset, alignment_needed); + column_start.emplace_back(at_offset); + at_offset += allocation_needed; + } + + // Now we need to add in space for validity + // Eventually we can think about nullable vs not nullable, but for now we will just always add + // it in + int32_t validity_bytes_needed = + (schema.size() + 7) / 8; // cudf::util::div_rounding_up_safe(schema.size(), 8); + // validity comes at the end and is byte aligned so we can pack more in. 
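+  // For example (hypothetical schema): INT8, INT32, INT64 are laid out at offsets 0, 4 and 8,
+  // so at_offset is 16 here; one validity byte brings it to 17, and the final alignment below
+  // pads the row out to 24 bytes.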
+ at_offset += validity_bytes_needed; + // Now we need to pad the end so all rows are 64 bit aligned + return align_offset(at_offset, 8); // 8 bytes (64 bits) + } + + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + + template + static size_type compute_column_information(iterator begin, iterator end, + std::vector &column_starts, + std::vector &column_sizes) //, + // std::function nested_type_cb) + { + size_type fixed_width_size_per_row = 0; + for (auto cv = begin; cv != end; ++cv) { + auto col_type = std::get<0>(*cv); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + // if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + } + + auto validity_offset = fixed_width_size_per_row; + column_starts.push_back(validity_offset); + + return fixed_width_size_per_row; + } + + std::vector + build_validity_block_infos(size_type const &num_columns, size_type const &num_rows, + size_type const &shmem_limit_per_block, + std::vector const &row_batches) { + auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); + auto const column_stride = align_offset( + [&]() { + if (desired_rows_and_columns > num_columns) { + // not many columns, group it into 8s and ship it off + return std::min(8, num_columns); + } else { + return util::round_down_safe(desired_rows_and_columns, 8); + } + }(), + 8); + // we fit as much as we can given the column stride + // note that an element in the table takes just 1 bit, but a row with a single + // element still takes 8 bytes! 
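+  // As a rough worked example (hypothetical limits): with shmem_limit_per_block = 24 KB the
+  // desired edge is sqrt(24576), roughly 156, so for a wide table column_stride becomes
+  // round_down(156, 8) = 152 columns. The line below then gives bytes_per_row =
+  // align_offset(ceil(152 / 8), 8) = 24, so a tall table gets row_stride = 24576 / 24 = 1024
+  // rows per validity block.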
+ auto const bytes_per_row = align_offset(util::div_rounding_up_unsafe(column_stride, 8), 8); + auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); + + std::vector validity_block_infos; + for (int col = 0; col < num_columns; col += column_stride) { + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int row = 0; + while (row < num_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(row_stride, rows_left_in_batch); + + validity_block_infos.emplace_back(detail::block_info{ + col, row, std::min(col + column_stride - 1, num_columns - 1), row + window_height - 1}); + row += window_height; + rows_left_in_batch -= window_height; + } + } + + return validity_block_infos; + } + + std::vector build_block_infos(std::vector const &column_sizes, + std::vector const &column_starts, + std::vector const &row_batches, + size_type const total_number_of_rows, + size_type const &shmem_limit_per_block) { + std::vector block_infos; + + // block infos are organized with the windows going "down" the columns + // this provides the most coalescing of memory access + int current_window_width = 0; + int current_window_start_col = 0; + + // build the blocks for a specific set of columns + auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( + int const start_col, int const end_col, int const desired_window_height) { + int current_window_start_row = 0; + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; + while (i < total_number_of_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(desired_window_height, rows_left_in_batch); + + block_infos.emplace_back(detail::block_info{ + start_col, current_window_start_row, end_col, + std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), + current_window_row_batch}); + + i += window_height; + current_window_start_row += window_height; + rows_left_in_batch -= window_height; + } + }; + + // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write + // would be memory cache line sized access, but since other blocks will read/write the edges + // this may not turn out to be overly important. For now, we will attempt to build a square + // window as far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = + // shared_mem_size since we want them equal, so height and width are sqrt(shared_mem_size). The + // trick is that it's in bytes, not rows or columns. + size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); + int const window_height = std::clamp( + util::round_up_safe( + std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], + total_number_of_rows), + 32), + 1, row_batches[0].row_count); + + auto calc_admin_data_size = [](int num_cols) -> size_type { + // admin data is the column sizes and column start information. + // this is copied to shared memory as well and needs to be accounted for + // in the window calculation. 
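+    // e.g. a window of 6 columns needs 6 * 4 + 6 * 4 = 48 bytes of shared memory for this
+    // bookkeeping on top of the row data itself (assuming the usual 4-byte size_type).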
+ return num_cols * sizeof(size_type) + num_cols * sizeof(size_type); + }; + + int row_size = 0; + + // march each column and build the blocks of appropriate sizes + for (unsigned int col = 0; col < column_sizes.size(); ++col) { + auto const col_size = column_sizes[col]; + + // align size for this type + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_aligned = detail::align_offset(row_size, alignment_needed); + auto row_size_with_this_col = row_size_aligned + col_size; + auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); + + if (row_size_with_end_pad * window_height + + calc_admin_data_size(col - current_window_start_col) > + shmem_limit_per_block) { + // too large, close this window, generate vertical blocks and restart + build_blocks(current_window_start_col, col == 0 ? col : col - 1, window_height); + row_size = + detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); + row_size += col_size; // alignment required for shared memory window boundary to match // alignment of output row - current_window_start_col = col; - current_window_width = 0; - } else { - row_size = row_size_with_this_col; - current_window_width++; - } - } - - // build last set of blocks - if (current_window_width > 0) { - build_blocks(current_window_start_col, (int)column_sizes.size() - 1, window_height); - } - - return block_infos; -} - -#if defined(DEBUG) -void pretty_print(uint64_t i) -{ - if (i > (1 * 1024 * 1024 * 1024)) { - printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); - } else if (i > (1 * 1024 * 1024)) { - printf("%.2f MB", i / float(1 * 1024 * 1024)); - } else if (i > (1 * 1024)) { - printf("%.2f KB", float(i / 1024)); - } else { - printf("%lu Bytes", i); - } -} -#endif -#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - -} // namespace detail - -std::vector> convert_to_rows(cudf::table_view const& tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the - // data, but small enough that multiple columns fit in memory so the writes can coalese as well. - // Potential optimization for window sizes. - const size_type num_columns = tbl.num_columns(); - const size_type num_rows = tbl.num_rows(); - - int device_id; - CUDA_TRY(cudaGetDevice(&device_id)); - int total_shmem; - CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - -#if defined(DEBUG) || 1 - total_shmem -= 1024; -#endif - int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - -#if defined(DEBUG) - size_t free, total; - cudaMemGetInfo(&free, &total); - printf("%lu/%lu Memory\n", free, total); -#endif - - // break up the work into blocks, which are a starting and ending row/col #. - // this window size is calculated based on the shared memory size available - // we want a single block to fill up the entire shared memory space available - // for the transpose-like conversion. - - // There are two different processes going on here. The GPU conversion of the data - // and the writing of the data into the list of byte columns that are a maximum of - // 2 gigs each due to offset maximum size. The GPU conversion portion has to understand - // this limitation because the column must own the data inside and as a result it must be - // a distinct allocation for that column. 
Copying the data into these final buffers would - // be prohibitively expensive, so care is taken to ensure the GPU writes to the proper buffer. - // The windows are broken at the boundaries of specific rows based on the row sizes up - // to that point. These are row batches and they are decided first before building the - // windows so the windows can be properly cut around them. - - // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; - input_data.reserve(num_columns); - input_nm.reserve(num_columns); - for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); - auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (!nested_type) { - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - } - - auto dev_input_data = make_device_uvector_async(input_data, stream, mr); - auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); - - std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row - std::vector column_sizes; // byte size of each column - std::vector column_starts; // offset of column inside a row including alignment - std::vector - variable_width_columns; // list of the variable width columns in the table - row_sizes.reserve(num_rows); - row_offsets.reserve(num_rows); - column_sizes.reserve(num_columns); - column_starts.reserve(num_columns + 1); // we add a final offset for validity data start - - auto iter = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [&tbl](auto i) -> std::tuple { - return std::make_tuple(tbl.column(i).type(), tbl.column(i)); - }); - - size_type fixed_width_size_per_row = detail::compute_column_information(iter, - iter + num_columns, - column_starts, - column_sizes); //, - // [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); - /* size_type fixed_width_size_per_row = 0; - for (int col = 0; col < num_columns; ++col) { - auto cv = tbl.column(col); - auto col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (nested_type) { variable_width_columns.push_back(cv); } - - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 8 : size_of(col_type); - - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - }*/ - -#if defined(DEBUG) - printf("validity offset will be %d + %d = %d\n", - column_starts.back(), - column_sizes.back(), - column_starts.back() + column_sizes.back()); -#endif - - auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); - auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); - - std::vector row_batches; - - auto calculate_variable_width_row_data_size = [](int const row) { - // each level of variable-width data will add an offset/length - // uint64 of data. The first of which is inside the fixed-width - // data itself and needs to be aligned based on what is around - // that data. 
This is handled above with the fixed-width calculations - // for that reason. We may still need to add more of these offset/length - // combinations if the nesting is deeper than one level as these - // will be included in the variable-width data blob at the end of the - // row. - return 0; - /* auto c = variable_width_columns[col]; - while (true) { - auto col_offsets = c.child(0).data(); - auto col_data_size = size_of(c.child(1).type()); - std::size_t alignment_needed = col_data_size; - - row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; - if (c.num_children() == 0) { - break; - } - c = c.child(1); - } - exclusive_scan([t](int row_index) { - size_type total_row_size = 0; - for (int i=0 i - (uint64_t)std::numeric_limits::max()) { - // a new batch starts at the last 32-row boundary - row_batches.push_back( - detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); - row_batch_size = 0; - row_batch_rows = row_batch_rows & 31; - row_offset = 0; - aligned_row_batch_size = 0; - } - row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned - row_offsets.push_back(row_offset); - row_batch_size = aligned_row_batch_size + row_sizes[row]; - row_offset += row_sizes[row]; - total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned - total_table_size += row_sizes[row]; - row_batch_rows++; - } - if (row_batch_size > 0) { - row_batches.push_back( - detail::row_batch{static_cast(row_batch_size), row_batch_rows}); - } - - auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); - -#if defined(DEBUG) - printf("%d rows and %d columns in table\n", num_rows, num_columns); - printf("%lu batches:\n", row_batches.size()); - for (auto i = 0; i < (int)row_batches.size(); ++i) { - printf("%d: %d rows, ", i, row_batches[i].row_count); - detail::pretty_print(row_batches[i].num_bytes); - printf("\n"); - } -#endif - - std::vector output_buffers; - std::vector output_data; - output_data.reserve(row_batches.size()); - for (uint i = 0; i < row_batches.size(); ++i) { - rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); - output_data.push_back(static_cast(temp.data())); - output_buffers.push_back(std::move(temp)); - } - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - -#if defined(DEBUG) - printf("%lu windows for %d columns, %d rows to fit in ", - block_infos.size(), - block_infos[0].end_col - block_infos[0].start_col + 1, - block_infos[0].end_row - block_infos[0].start_row); - detail::pretty_print(shmem_limit_per_block); - printf(" shared mem("); - detail::pretty_print(fixed_width_size_per_row); - printf("/row, %d columns, %d rows, ", num_columns, num_rows); - detail::pretty_print(total_table_size); - printf(" total):\n"); -#endif - - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - - // blast through the entire table and convert it - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS)); - dim3 threads(256); - -#if defined(DEBUG) - printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); - detail::pretty_print(shmem_limit_per_block); - printf(" shared memory\n"); -#endif - detail::copy_from_columns<<>>( - num_rows, - num_columns, - shmem_limit_per_block, - block_infos.size(), - dev_input_data.data(), - dev_col_sizes.data(), - dev_col_starts.data(), - 
dev_block_infos.data(), - dev_row_offsets.data(), - reinterpret_cast(dev_output_data.data())); - - auto validity_block_infos = - build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); - - auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); - dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); - dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); -#if defined(DEBUG) - printf("Launching validity kernel with %d blocks, for %lu validity blocks with %d threads, ", - validity_blocks.x, - validity_block_infos.size(), - validity_threads.x); - detail::pretty_print(total_shmem); - printf(" shared memory\n"); -#endif - detail:: - copy_validity_from_columns<<>>( - num_rows, - num_columns, - shmem_limit_per_block, - dev_row_offsets.data(), - dev_output_data.data(), - column_starts.back(), - dev_validity_block_infos.data(), - validity_block_infos.size(), - dev_input_nm.data()); - - // split up the output buffer into multiple buffers based on row batch sizes - // and create list of byte columns - int offset_offset = 0; - std::vector> ret; - for (uint i = 0; i < row_batches.size(); ++i) { - // compute offsets for this row batch - std::vector offset_vals; - offset_vals.reserve(row_batches[i].row_count + 1); - size_type cur_offset = 0; - offset_vals.push_back(cur_offset); - for (int row = 0; row < row_batches[i].row_count; ++row) { - cur_offset = detail::align_offset(cur_offset, 8) + row_sizes[row + offset_offset]; - offset_vals.push_back(cur_offset); - } - offset_offset += row_batches[i].row_count; - - auto dev_offsets = make_device_uvector_async(offset_vals, stream, mr); - auto offsets = std::make_unique( - data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); - - auto data = std::make_unique( - data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, std::move(output_buffers[i])); - - ret.push_back(cudf::make_lists_column(row_batches[i].row_count, - std::move(offsets), - std::move(data), - 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, - stream, - mr)); - } - - return ret; -#else - CUDF_FAIL("Column to row conversion optimization requires volta or later hardware."); - return {}; -#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 -} - -std::vector> convert_to_rows_fixed_width_optimized( - cudf::table_view const& tbl, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) -{ - const cudf::size_type num_columns = tbl.num_columns(); - - std::vector schema; - schema.resize(num_columns); - std::transform(tbl.begin(), tbl.end(), schema.begin(), detail::get_data_type); - - if (detail::are_all_fixed_width(schema)) { - std::vector column_start; - std::vector column_size; - - int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); - auto dev_column_start = make_device_uvector_async(column_start, stream, mr); - auto dev_column_size = make_device_uvector_async(column_size, stream, mr); - - int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; - // Make the number of rows per batch a multiple of 32 so we don't have to worry about - // splitting validity at a specific row offset. This might change in the future. 
- max_rows_per_batch = (max_rows_per_batch / 32) * 32; - - cudf::size_type num_rows = tbl.num_rows(); - - // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; - for (cudf::size_type column_number = 0; column_number < num_columns; column_number++) { - cudf::column_view cv = tbl.column(column_number); - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - auto dev_input_data = make_device_uvector_async(input_data, stream, mr); - auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); - - using ScalarType = cudf::scalar_type_t; - auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - zero->set_valid_async(true, stream); - static_cast(zero.get())->set_value(0, stream); - - auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - step->set_valid_async(true, stream); - static_cast(step.get()) - ->set_value(static_cast(size_per_row), stream); - - std::vector> ret; - for (cudf::size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { - cudf::size_type row_count = num_rows - row_start; - row_count = row_count > max_rows_per_batch ? max_rows_per_batch : row_count; - ret.emplace_back(detail::fixed_width_convert_to_rows(row_start, - row_count, - num_columns, - size_per_row, - dev_column_start, - dev_column_size, - dev_input_data, - dev_input_nm, - *zero, - *step, - stream, - mr)); - } - - return ret; - } else { - CUDF_FAIL("Only fixed width types are currently supported"); - } -} - -std::unique_ptr convert_from_rows(cudf::lists_column_view const& input, - std::vector const& schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - // verify that the types are what we expect - cudf::column_view child = input.child(); - cudf::type_id list_type = child.type().id(); - CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, - "Only a list of bytes is supported as input"); - - cudf::size_type num_columns = schema.size(); - cudf::size_type num_rows = input.parent().size(); - - int device_id; - CUDA_TRY(cudaGetDevice(&device_id)); - int total_shmem; - CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - -#if defined(DEBUG) || 1 - total_shmem -= 1024; -#endif - int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - - std::vector column_starts; - std::vector column_sizes; - - auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { - return std::make_tuple(schema[i], nullptr); - }); - size_type fixed_width_size_per_row = detail::compute_column_information( - iter, iter + num_columns, column_starts, column_sizes); //, [](void *) {}); - - size_type validity_size = num_bitmask_words(num_columns) * 4; - - size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); - - // Ideally we would check that the offsets are all the same, etc. 
but for now - // this is probably fine - CUDF_EXPECTS(row_size * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); - auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); - - // build the row_batches from the passed in list column - std::vector row_batches; - - row_batches.push_back(detail::row_batch{child.size(), num_rows}); - - // Allocate the columns we are going to write into - std::vector> output_columns; - std::vector output_data; - std::vector output_nm; - for (cudf::size_type i = 0; i < num_columns; i++) { - auto column = cudf::make_fixed_width_column( - schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); - auto mut = column->mutable_view(); - output_data.emplace_back(mut.data()); - output_nm.emplace_back(mut.null_mask()); - output_columns.emplace_back(std::move(column)); - } - - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); - - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); -#if defined(DEBUG) - dim3 threads(std::min(std::min(128, shmem_limit_per_block / 8), (int)child.size())); -#else - dim3 threads(std::min(256, (int)child.size())); -#endif -#if defined(DEBUG) - printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); - detail::pretty_print(total_shmem); - printf(" shared memory\n"); -#endif - detail::copy_to_columns<<>>( - num_rows, - num_columns, - shmem_limit_per_block, - input.offsets().data(), - dev_output_data.data(), - dev_col_sizes.data(), - dev_col_starts.data(), - dev_block_infos.data(), - block_infos.size(), - child.data()); - - auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); - auto const column_stride = [&]() { - if (desired_rows_and_columns > num_columns) { - // not many columns, group it into 64s and ship it off - return std::min(64, num_columns); - } else { - return util::round_down_safe(desired_rows_and_columns, 8); - } - }(); - auto const row_stride = [&]() { - // we fit as much as we can, we know the column stride now, so calculate the row - return std::min(num_rows, util::round_down_safe(shmem_limit_per_block * 8 / column_stride, 32)); - /* if (desired_rows_and_columns > num_rows) { - return std::min(32, num_rows); - } else { - return util::round_down_safe(desired_rows_and_columns, 32); - }*/ - }(); - std::vector validity_block_infos; - for (int col = 0; col < num_columns; col += column_stride) { - for (int row = 0; row < num_rows; row += row_stride) { - validity_block_infos.emplace_back( - detail::block_info{col, - row, - std::min(col + column_stride - 1, num_columns - 1), - std::min(row + row_stride - 1, num_rows - 1)}); - } - } - auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); - dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); -#if defined(DEBUG) - printf( - "Launching validity kernel with %d blocks, for %lu validity blocks, col stride %d and row " - "stride of %d with %d threads, ", - validity_blocks.x, - validity_block_infos.size(), - column_stride, - row_stride, - threads.x); - detail::pretty_print(total_shmem); - 
printf(" shared memory\n"); -#endif - - dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); - detail:: - copy_validity_to_columns<<>>( - num_rows, - num_columns, - shmem_limit_per_block, - input.offsets().data(), - dev_output_nm.data(), - column_starts.back(), - dev_validity_block_infos.data(), - validity_block_infos.size(), - child.data()); - - return std::make_unique(std::move(output_columns)); -#else - CUDF_FAIL("Row to column conversion optimization requires volta or later hardware."); - return {}; -#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 -} - -std::unique_ptr convert_from_rows_fixed_width_optimized( - cudf::lists_column_view const& input, - std::vector const& schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // verify that the types are what we expect - cudf::column_view child = input.child(); - cudf::type_id list_type = child.type().id(); - CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, - "Only a list of bytes is supported as input"); - - cudf::size_type num_columns = schema.size(); - - if (detail::are_all_fixed_width(schema)) { - std::vector column_start; - std::vector column_size; - - cudf::size_type num_rows = input.parent().size(); - int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); - - // Ideally we would check that the offsets are all the same, etc. but for now - // this is probably fine - CUDF_EXPECTS(size_per_row * num_rows == child.size(), - "The layout of the data appears to be off"); - auto dev_column_start = make_device_uvector_async(column_start, stream); - auto dev_column_size = make_device_uvector_async(column_size, stream); - - // Allocate the columns we are going to write into - std::vector> output_columns; - std::vector output_data; - std::vector output_nm; - for (cudf::size_type i = 0; i < num_columns; i++) { - auto column = cudf::make_fixed_width_column( - schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); - auto mut = column->mutable_view(); - output_data.emplace_back(mut.data()); - output_nm.emplace_back(mut.null_mask()); - output_columns.emplace_back(std::move(column)); - } - - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); - - dim3 blocks; - dim3 threads; - int shared_size = - detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - - // printf("Launching (%d, %d, %d) blocks, (%d, %d, %d) threads, with %d shared size\n", - // blocks.x, blocks.y, blocks.z, threads.x, threads.y, threads.z, shared_size); - // printf("pointers are column_start: %p, column_size: %p, output_data: %p, output_nm: %p\n", - // dev_column_start.data(), dev_column_size.data(), dev_output_data.data(), - // dev_output_nm.data()); - detail::copy_to_fixed_width_columns<<>>( - num_rows, - num_columns, - size_per_row, - dev_column_start.data(), - dev_column_size.data(), - dev_output_data.data(), - dev_output_nm.data(), - child.data()); - - return std::make_unique(std::move(output_columns)); - } else { - CUDF_FAIL("Only fixed width types are currently supported"); - } -} - -} // namespace cudf + current_window_start_col = col; + current_window_width = 0; + } else { + row_size = row_size_with_this_col; + current_window_width++; + } + } + + // build last set of blocks + if (current_window_width > 0) { + build_blocks(current_window_start_col, (int)column_sizes.size() - 1, window_height); + } 
+ + return block_infos; + } + + #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + + } // namespace detail + + std::vector> convert_to_rows(cudf::table_view const &tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the + // data, but small enough that multiple columns fit in memory so the writes can coalese as well. + // Potential optimization for window sizes. + const size_type num_columns = tbl.num_columns(); + const size_type num_rows = tbl.num_rows(); + + int device_id; + CUDA_TRY(cudaGetDevice(&device_id)); + int total_shmem; + CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + + // TODO: why? + total_shmem -= 1024; + int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + + // break up the work into blocks, which are a starting and ending row/col #. + // this window size is calculated based on the shared memory size available + // we want a single block to fill up the entire shared memory space available + // for the transpose-like conversion. + + // There are two different processes going on here. The GPU conversion of the data + // and the writing of the data into the list of byte columns that are a maximum of + // 2 gigs each due to offset maximum size. The GPU conversion portion has to understand + // this limitation because the column must own the data inside and as a result it must be + // a distinct allocation for that column. Copying the data into these final buffers would + // be prohibitively expensive, so care is taken to ensure the GPU writes to the proper buffer. + // The windows are broken at the boundaries of specific rows based on the row sizes up + // to that point. These are row batches and they are decided first before building the + // windows so the windows can be properly cut around them. 
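As a rough illustration of the batching rule described above, the sketch below walks per-row byte sizes in groups of 32 rows and closes a batch whenever the next group would push it past the signed 32-bit offset limit. Names such as cut_row_batches are illustrative; the real code folds this into the row-size loop that follows.

#include <cstdint>
#include <limits>
#include <vector>

struct batch { std::uint64_t num_bytes; int row_count; };

// row_bytes holds the already-aligned size of every row; each group of 32 rows
// is assumed to fit under the limit on its own.
std::vector<batch> cut_row_batches(std::vector<std::uint64_t> const &row_bytes) {
  constexpr std::uint64_t limit = std::numeric_limits<std::int32_t>::max();
  std::vector<batch> batches;
  std::uint64_t batch_size = 0;
  int batch_rows           = 0;
  for (std::size_t row = 0; row < row_bytes.size(); row += 32) {
    std::uint64_t group_size = 0;
    int group_rows           = 0;
    for (std::size_t r = row; r < row_bytes.size() && r < row + 32; ++r) {
      group_size += row_bytes[r];
      ++group_rows;
    }
    if (batch_rows > 0 && batch_size + group_size > limit) {
      batches.push_back({batch_size, batch_rows});  // close the batch on a 32-row boundary
      batch_size = 0;
      batch_rows = 0;
    }
    batch_size += group_size;
    batch_rows += group_rows;
  }
  if (batch_rows > 0) batches.push_back({batch_size, batch_rows});
  return batches;
}

Cutting only on 32-row boundaries keeps a validity word from spanning two output buffers, mirroring the comment in the fixed-width path further down.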
+ + // Get the pointers to the input columnar data ready + std::vector input_data; + std::vector input_nm; + input_data.reserve(num_columns); + input_nm.reserve(num_columns); + for (size_type column_number = 0; column_number < num_columns; column_number++) { + column_view cv = tbl.column(column_number); + auto const col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (!nested_type) { + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + } + + auto dev_input_data = make_device_uvector_async(input_data, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); + + std::vector row_sizes; // size of each row in bytes including any alignment padding + std::vector row_offsets; // offset from the start of the data to this row + std::vector column_sizes; // byte size of each column + std::vector column_starts; // offset of column inside a row including alignment + std::vector + variable_width_columns; // list of the variable width columns in the table + row_sizes.reserve(num_rows); + row_offsets.reserve(num_rows); + column_sizes.reserve(num_columns); + column_starts.reserve(num_columns + 1); // we add a final offset for validity data start + + auto iter = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [&tbl](auto i) -> std::tuple { + return std::make_tuple(tbl.column(i).type(), tbl.column(i)); + }); + + size_type fixed_width_size_per_row = + detail::compute_column_information(iter, iter + num_columns, column_starts, column_sizes); + + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); + + std::vector row_batches; + + uint64_t row_batch_size = 0; + uint64_t total_table_size = 0; + size_type row_batch_rows = 0; + uint64_t row_offset = 0; + + // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then + // calculate the size of each row's variable-width data and validity as well. 
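A small worked example of the row-size rule this comment describes, assuming the validity bitmap is stored as whole 32-bit words (one bit per column) and rows are padded to 8 bytes; row_size_estimate is an illustrative helper, not part of the cudf API:

#include <cstdint>
#include <iostream>

constexpr std::int32_t align_offset(std::int32_t offset, std::int32_t alignment) {
  return (offset + alignment - 1) & ~(alignment - 1);
}

std::int32_t row_size_estimate(std::int32_t fixed_width_bytes, std::int32_t num_columns) {
  std::int32_t const validity_bytes = ((num_columns + 31) / 32) * 4;  // one bit per column, whole words
  return align_offset(fixed_width_bytes + validity_bytes, 8);         // rows are 8-byte aligned
}

int main() {
  // nine columns adding up to 30 bytes of fixed-width data:
  // 30 + 4 validity bytes = 34, padded up to 40 bytes per row
  std::cout << row_size_estimate(30, 9) << "\n";  // prints 40
}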
+ auto validity_size = num_bitmask_words(num_columns) * 4; + // thrust + for (int row = 0; row < num_rows; ++row) { + auto aligned_row_batch_size = + detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned + row_sizes[row] = fixed_width_size_per_row; + // validity is byte aligned + row_sizes[row] += validity_size; + // variable width data is 8-byte aligned + row_sizes[row] = detail::align_offset(row_sizes[row], 8); // rows are 8 byte aligned + + if ((uint64_t)aligned_row_batch_size + row_sizes[row] > + (uint64_t)std::numeric_limits::max()) { + // a new batch starts at the last 32-row boundary + row_batches.push_back( + detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batch_size = 0; + row_batch_rows = row_batch_rows & 31; + row_offset = 0; + aligned_row_batch_size = 0; + } + row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned + row_offsets.push_back(row_offset); + row_batch_size = aligned_row_batch_size + row_sizes[row]; + row_offset += row_sizes[row]; + total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned + total_table_size += row_sizes[row]; + row_batch_rows++; + } + if (row_batch_size > 0) { + row_batches.push_back( + detail::row_batch{static_cast(row_batch_size), row_batch_rows}); + } + + auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); + + std::vector output_buffers; + std::vector output_data; + output_data.reserve(row_batches.size()); + for (uint i = 0; i < row_batches.size(); ++i) { + rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); + output_data.push_back(static_cast(temp.data())); + output_buffers.push_back(std::move(temp)); + } + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); + + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); + + // blast through the entire table and convert it + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); + dim3 threads(256); + + detail::copy_to_rows<<>>( + num_rows, num_columns, shmem_limit_per_block, block_infos.size(), dev_input_data.data(), + dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), dev_row_offsets.data(), + reinterpret_cast(dev_output_data.data())); + + auto validity_block_infos = + build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); + + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); + dim3 validity_blocks( + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); + dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); + detail::copy_validity_to_rows<<>>( + num_rows, num_columns, shmem_limit_per_block, dev_row_offsets.data(), dev_output_data.data(), + column_starts.back(), dev_validity_block_infos.data(), validity_block_infos.size(), + dev_input_nm.data()); + + // split up the output buffer into multiple buffers based on row batch sizes + // and create list of byte columns + int offset_offset = 0; + std::vector> ret; + for (uint i = 0; i < row_batches.size(); ++i) { + // compute offsets for this row batch + std::vector offset_vals; + offset_vals.reserve(row_batches[i].row_count + 1); + size_type cur_offset = 0; + offset_vals.push_back(cur_offset); + for (int row = 0; row < row_batches[i].row_count; ++row) { + cur_offset = 
detail::align_offset(cur_offset, 8) + row_sizes[row + offset_offset]; + offset_vals.push_back(cur_offset); + } + offset_offset += row_batches[i].row_count; + + auto dev_offsets = make_device_uvector_async(offset_vals, stream, mr); + auto offsets = std::make_unique(data_type{type_id::INT32}, + (size_type)offset_vals.size(), dev_offsets.release()); + + auto data = std::make_unique(data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, + std::move(output_buffers[i])); + + ret.push_back( + cudf::make_lists_column(row_batches[i].row_count, std::move(offsets), std::move(data), 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr)); + } + + return ret; + #else + CUDF_FAIL("Column to row conversion optimization requires volta or later hardware."); + return {}; + #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + } + + std::vector> + convert_to_rows_fixed_width_optimized(cudf::table_view const &tbl, rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { + const cudf::size_type num_columns = tbl.num_columns(); + + std::vector schema; + schema.resize(num_columns); + std::transform(tbl.begin(), tbl.end(), schema.begin(), detail::get_data_type); + + if (detail::are_all_fixed_width(schema)) { + std::vector column_start; + std::vector column_size; + + int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); + auto dev_column_start = make_device_uvector_async(column_start, stream, mr); + auto dev_column_size = make_device_uvector_async(column_size, stream, mr); + + int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; + // Make the number of rows per batch a multiple of 32 so we don't have to worry about + // splitting validity at a specific row offset. This might change in the future. + max_rows_per_batch = (max_rows_per_batch / 32) * 32; + + cudf::size_type num_rows = tbl.num_rows(); + + // Get the pointers to the input columnar data ready + std::vector input_data; + std::vector input_nm; + for (cudf::size_type column_number = 0; column_number < num_columns; column_number++) { + cudf::column_view cv = tbl.column(column_number); + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + auto dev_input_data = make_device_uvector_async(input_data, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); + + using ScalarType = cudf::scalar_type_t; + auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); + zero->set_valid_async(true, stream); + static_cast(zero.get())->set_value(0, stream); + + auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); + step->set_valid_async(true, stream); + static_cast(step.get()) + ->set_value(static_cast(size_per_row), stream); + + std::vector> ret; + for (cudf::size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { + cudf::size_type row_count = num_rows - row_start; + row_count = row_count > max_rows_per_batch ? 
max_rows_per_batch : row_count; + ret.emplace_back(detail::fixed_width_convert_to_rows( + row_start, row_count, num_columns, size_per_row, dev_column_start, dev_column_size, + dev_input_data, dev_input_nm, *zero, *step, stream, mr)); + } + + return ret; + } else { + CUDF_FAIL("Only fixed width types are currently supported"); + } + } + + std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + // verify that the types are what we expect + cudf::column_view child = input.child(); + cudf::type_id list_type = child.type().id(); + CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, + "Only a list of bytes is supported as input"); + + cudf::size_type num_columns = schema.size(); + cudf::size_type num_rows = input.parent().size(); + + int device_id; + CUDA_TRY(cudaGetDevice(&device_id)); + int total_shmem; + CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + + // TODO why? + total_shmem -= 1024; + int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + + std::vector column_starts; + std::vector column_sizes; + + auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { + return std::make_tuple(schema[i], nullptr); + }); + size_type fixed_width_size_per_row = detail::compute_column_information( + iter, iter + num_columns, column_starts, column_sizes); //, [](void *) {}); + + size_type validity_size = num_bitmask_words(num_columns) * 4; + + size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); + + // Ideally we would check that the offsets are all the same, etc. 
but for now + // this is probably fine + CUDF_EXPECTS(row_size * num_rows == child.size(), "The layout of the data appears to be off"); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); + + // build the row_batches from the passed in list column + std::vector row_batches; + + row_batches.push_back(detail::row_batch{child.size(), num_rows}); + + // Allocate the columns we are going to write into + std::vector> output_columns; + std::vector output_data; + std::vector output_nm; + for (cudf::size_type i = 0; i < num_columns; i++) { + auto column = cudf::make_fixed_width_column(schema[i], num_rows, + cudf::mask_state::UNINITIALIZED, stream, mr); + auto mut = column->mutable_view(); + output_data.emplace_back(mut.data()); + output_nm.emplace_back(mut.null_mask()); + output_columns.emplace_back(std::move(column)); + } + + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); + + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); + + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); + + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); + dim3 threads(std::min(std::min(256, shmem_limit_per_block / 8), (int)child.size())); + detail::copy_from_rows<<>>( + num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), + dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), + block_infos.size(), child.data()); + + auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); + auto const column_stride = [&]() { + if (desired_rows_and_columns > num_columns) { + // not many columns, group it into 64s and ship it off + return std::min(64, num_columns); + } else { + return util::round_down_safe(desired_rows_and_columns, 8); + } + }(); + auto const row_stride = [&]() { + // we fit as much as we can, we know the column stride now, so calculate the row + return std::min(num_rows, util::round_down_safe(shmem_limit_per_block * 8 / column_stride, 32)); + /* if (desired_rows_and_columns > num_rows) { + return std::min(32, num_rows); + } else { + return util::round_down_safe(desired_rows_and_columns, 32); + }*/ + }(); + std::vector validity_block_infos; + for (int col = 0; col < num_columns; col += column_stride) { + for (int row = 0; row < num_rows; row += row_stride) { + validity_block_infos.emplace_back( + detail::block_info{col, row, std::min(col + column_stride - 1, num_columns - 1), + std::min(row + row_stride - 1, num_rows - 1)}); + } + } + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); + dim3 validity_blocks( + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); + + dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); + detail:: + copy_validity_from_rows<<>>( + num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), + dev_output_nm.data(), column_starts.back(), dev_validity_block_infos.data(), + validity_block_infos.size(), child.data()); + + return std::make_unique(std::move(output_columns)); + #else + CUDF_FAIL("Row to column conversion optimization requires volta or later hardware."); + return {}; + #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + } + + std::unique_ptr 
convert_from_rows_fixed_width_optimized( + cudf::lists_column_view const &input, std::vector const &schema, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { + // verify that the types are what we expect + cudf::column_view child = input.child(); + cudf::type_id list_type = child.type().id(); + CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, + "Only a list of bytes is supported as input"); + + cudf::size_type num_columns = schema.size(); + + if (detail::are_all_fixed_width(schema)) { + std::vector column_start; + std::vector column_size; + + cudf::size_type num_rows = input.parent().size(); + int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); + + // Ideally we would check that the offsets are all the same, etc. but for now + // this is probably fine + CUDF_EXPECTS(size_per_row * num_rows == child.size(), + "The layout of the data appears to be off"); + auto dev_column_start = make_device_uvector_async(column_start, stream); + auto dev_column_size = make_device_uvector_async(column_size, stream); + + // Allocate the columns we are going to write into + std::vector> output_columns; + std::vector output_data; + std::vector output_nm; + for (cudf::size_type i = 0; i < num_columns; i++) { + auto column = cudf::make_fixed_width_column(schema[i], num_rows, + cudf::mask_state::UNINITIALIZED, stream, mr); + auto mut = column->mutable_view(); + output_data.emplace_back(mut.data()); + output_nm.emplace_back(mut.null_mask()); + output_columns.emplace_back(std::move(column)); + } + + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); + + dim3 blocks; + dim3 threads; + int shared_size = + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + + detail::copy_from_rows_fixed_width_optimized<<>>( + num_rows, num_columns, size_per_row, dev_column_start.data(), dev_column_size.data(), + dev_output_data.data(), dev_output_nm.data(), child.data()); + + return std::make_unique(std::move(output_columns)); + } else { + CUDF_FAIL("Only fixed width types are currently supported"); + } + } + + } // namespace cudf + \ No newline at end of file diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index a67589fbaec..932afa4bb70 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -50,8 +50,8 @@ #include #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 -constexpr auto NUM_BLOCKS_PER_KERNEL_TO_COLUMNS = 8; -constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 2; +constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_TO_ROWS = 2; constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; @@ -67,13 +67,11 @@ static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size return (offset + alignment - 1) & ~(alignment - 1); } -__global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, - const cudf::size_type num_columns, - const cudf::size_type row_size, - const cudf::size_type *input_offset_in_row, - const cudf::size_type *num_bytes, int8_t **output_data, - cudf::bitmask_type **output_nm, - const int8_t *input_data) { +__global__ void copy_from_rows_fixed_width_optimized( + const cudf::size_type num_rows, const 
cudf::size_type num_columns, + const cudf::size_type row_size, const cudf::size_type *input_offset_in_row, + const cudf::size_type *num_bytes, int8_t **output_data, cudf::bitmask_type **output_nm, + const int8_t *input_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -190,12 +188,11 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, } } -__global__ void -copy_from_fixed_width_columns(const cudf::size_type start_row, const cudf::size_type num_rows, - const cudf::size_type num_columns, const cudf::size_type row_size, - const cudf::size_type *output_offset_in_row, - const cudf::size_type *num_bytes, const int8_t **input_data, - const cudf::bitmask_type **input_nm, int8_t *output_data) { +__global__ void copy_to_rows_fixed_width_optimized( + const cudf::size_type start_row, const cudf::size_type num_rows, + const cudf::size_type num_columns, const cudf::size_type row_size, + const cudf::size_type *output_offset_in_row, const cudf::size_type *num_bytes, + const int8_t **input_data, const cudf::bitmask_type **input_nm, int8_t *output_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -367,12 +364,11 @@ struct row_batch { * @param output_data pointer to output data * */ -__global__ void copy_from_columns(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_block, - const size_type num_block_infos, const int8_t **input_data, - const size_type *col_sizes, const size_type *col_offsets, - const block_info *block_infos, const size_type *row_offsets, - int8_t **output_data) { +__global__ void copy_to_rows(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, const size_type num_block_infos, + const int8_t **input_data, const size_type *col_sizes, + const size_type *col_offsets, const block_info *block_infos, + const size_type *row_offsets, int8_t **output_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. 
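The two-pass idea in these comments is the classic staged-copy pattern: coalesced loads into shared memory, a sync, then stores in the transposed order. The toy kernel below shows the shape of it on a plain byte matrix; it is not the real copy_to_rows kernel, and TILE is just an illustrative tile size.

#include <cuda_runtime.h>
#include <cstdint>

constexpr int TILE = 32;

__global__ void two_pass_tile_copy(const int8_t *in, int8_t *out, int width, int height) {
  __shared__ int8_t tile[TILE][TILE + 1];  // +1 column of padding avoids shared-memory bank conflicts

  int const x = blockIdx.x * TILE + threadIdx.x;
  int const y = blockIdx.y * TILE + threadIdx.y;

  // pass 1: coalesced reads from global memory into the shared-memory tile
  if (x < width && y < height) tile[threadIdx.y][threadIdx.x] = in[y * width + x];
  __syncthreads();

  // pass 2: write the staged tile back out transposed; global writes stay coalesced
  int const out_x = blockIdx.y * TILE + threadIdx.x;
  int const out_y = blockIdx.x * TILE + threadIdx.y;
  if (out_x < height && out_y < width) out[out_y * height + out_x] = tile[threadIdx.x][threadIdx.y];
}

A launch of dim3((width + TILE - 1) / TILE, (height + TILE - 1) / TILE) blocks with dim3(TILE, TILE) threads covers the matrix; the real kernels replace the fixed tile with per-block block_info windows and cuda::memcpy_async staging.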
@@ -396,15 +392,15 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ group.sync(); auto const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS, - (uint)NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS); + std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS, + (uint)NUM_BLOCKS_PER_KERNEL_TO_ROWS); size_t fetch; size_t subset; for (subset = fetch = 0; subset < blocks_remaining; ++subset) { // Fetch ahead up to stages_count subsets for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { - auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch]; + auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + fetch]; auto const num_fetch_cols = fetch_block.num_cols(); auto const num_fetch_rows = fetch_block.num_rows(); auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; @@ -462,7 +458,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; subset_barrier.arrive_and_wait(); - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset]; + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + subset]; auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; auto const block_output_buffer = output_data[block.buffer_num]; @@ -499,7 +495,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ * @param input_data pointer to input data * */ -__global__ void copy_validity_from_columns( +__global__ void copy_validity_to_rows( const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, const size_type *row_offsets, int8_t **output_data, const size_type validity_offset, const block_info *block_infos, const size_type num_block_infos, const bitmask_type **input_nm) { @@ -633,74 +629,6 @@ get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num return {col_size_bytes, col_offset_bytes}; } -/** - * @brief ensure `read_ahead` buffer blocks are fetched - * - * @param fetch_index internal state passed into the function - * @param processing_index index where processing is occuring - * @param read_ahead_count how many blocks to read ahead - * @param max_resident_blocks how many blocks can be loaded at once - * @param total_blocks total number of blocks overall - * @param block_infos pointer to the block infos - * @param col_sizes pointer to column size information - * @param col_offsets pointer to the table's column offsets - * @param row_offsets pointer to offsets for each row in the table - * @param input_data pointer to the input data - * @param shared pointer to shared memory - * @param group thread group participating in the fetch - * @param block_barrier barriers used for each block - * @return - */ -static __device__ void -fetch_blocks_for_row_to_column(size_t &fetch_index, size_t const processing_index, - int const read_ahead_count, int const max_resident_blocks, - int const total_blocks, block_info const *const block_infos, - size_type const *const col_sizes, size_type const *const col_offsets, - size_type const *const row_offsets, int8_t const *const input_data, - int8_t *shared[], cooperative_groups::thread_block const group, - cuda::barrier *block_barrier) { - for (; fetch_index < static_cast(total_blocks) && - 
fetch_index < (processing_index + read_ahead_count); - ++fetch_index) { - auto const fetch_block = - block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index]; - auto const fetch_block_start_row = fetch_block.start_row; - auto const fetch_block_end_row = fetch_block.end_row; - auto const starting_col_offset = col_offsets[fetch_block.start_col]; - auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); - auto const num_fetch_cols = fetch_block.num_cols(); - auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( - sizeof(decltype(*col_sizes)), sizeof(decltype(*col_offsets)), num_fetch_cols); - auto &fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; - - // if we have fetched all buffers, we need to wait for processing - // to complete on them before we can use them again - if (fetch_index > NUM_BLOCKS_PER_KERNEL_LOADED) { - fetch_barrier.arrive_and_wait(); - } - - auto shared_row_offset = 0; - // copy the data for column sizes - cuda::memcpy_async(group, &shared[fetch_index % max_resident_blocks][shared_row_offset], - &col_sizes[fetch_block.start_col], col_size_bytes, fetch_barrier); - shared_row_offset += col_size_bytes; - // copy the data for column offsets - cuda::memcpy_async(group, &shared[fetch_index % max_resident_blocks][shared_row_offset], - &col_offsets[fetch_block.start_col], col_offset_bytes, fetch_barrier); - shared_row_offset += col_offset_bytes; - shared_row_offset = align_offset(shared_row_offset, 8); - - for (auto row = fetch_block_start_row + static_cast(threadIdx.x); - row <= fetch_block_end_row; row += blockDim.x) { - auto shared_offset = (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; - // copy the main - cuda::memcpy_async(&shared[fetch_index % max_resident_blocks][shared_offset], - &input_data[row_offsets[row] + starting_col_offset], fetch_block_row_size, - fetch_barrier); - } - } -} - /** * @brief copy data from row-based format to cudf columns * @@ -716,7 +644,7 @@ fetch_blocks_for_row_to_column(size_t &fetch_index, size_t const processing_inde * @param input_data pointer to input data * */ -__global__ void copy_to_columns(const size_type num_rows, const size_type num_columns, +__global__ void copy_from_rows(const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, const size_type *row_offsets, int8_t **output_data, const size_type *_col_sizes, const size_type *_col_offsets, const block_info *block_infos, @@ -746,40 +674,70 @@ __global__ void copy_to_columns(const size_type num_rows, const size_type num_co group.sync(); - auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS, - (uint)NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); + auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS, + (uint)NUM_BLOCKS_PER_KERNEL_FROM_ROWS); + + size_t fetch_index; + size_t processing_index; + for (processing_index = fetch_index = 0; processing_index < blocks_remaining; + ++processing_index) { + // Fetch ahead up to stages_count groups + for (; fetch_index < static_cast(blocks_remaining) && + fetch_index < (processing_index + stages_count); + ++fetch_index) { + auto const fetch_block = + block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + fetch_index]; + auto const fetch_block_start_row = fetch_block.start_row; + auto const fetch_block_end_row = fetch_block.end_row; + auto const starting_col_offset = _col_offsets[fetch_block.start_col]; + auto const 
fetch_block_row_size = fetch_block.get_shared_row_size(_col_offsets, _col_sizes); + auto const num_fetch_cols = fetch_block.num_cols(); + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( + sizeof(decltype(*_col_sizes)), sizeof(decltype(*_col_offsets)), num_fetch_cols); + auto &fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; - auto get_admin_data_sizes = [col_size_size = sizeof(decltype(*_col_sizes)), - col_offset_size = sizeof(decltype(*_col_offsets))]( - int const num_cols, - int const num_rows) -> std::tuple { - auto const col_size_bytes = num_cols * col_size_size; - auto const col_offset_bytes = num_cols * col_offset_size; + // if we have fetched all buffers, we need to wait for processing + // to complete on them before we can use them again + if (fetch_index > NUM_BLOCKS_PER_KERNEL_LOADED) { + fetch_barrier.arrive_and_wait(); + } - return {col_size_bytes, col_offset_bytes}; - }; + auto shared_row_offset = 0; + // copy the data for column sizes + cuda::memcpy_async(group, &shared[fetch_index % stages_count][shared_row_offset], + &_col_sizes[fetch_block.start_col], col_size_bytes, fetch_barrier); + shared_row_offset += col_size_bytes; + // copy the data for column offsets + cuda::memcpy_async(group, &shared[fetch_index % stages_count][shared_row_offset], + &_col_offsets[fetch_block.start_col], col_offset_bytes, fetch_barrier); + shared_row_offset += col_offset_bytes; + shared_row_offset = align_offset(shared_row_offset, 8); + + for (auto row = fetch_block_start_row + static_cast(threadIdx.x); + row <= fetch_block_end_row; row += blockDim.x) { + auto shared_offset = + (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; + // copy the main + cuda::memcpy_async(&shared[fetch_index % stages_count][shared_offset], + &input_data[row_offsets[row] + starting_col_offset], + fetch_block_row_size, fetch_barrier); + } + } - size_t fetch; - size_t subset; - for (subset = fetch = 0; subset < blocks_remaining; ++subset) { - // Fetch ahead up to stages_count subsets - fetch_blocks_for_row_to_column(fetch, subset, stages_count, stages_count, blocks_remaining, - block_infos, _col_sizes, _col_offsets, row_offsets, input_data, - shared, group, block_barrier); + auto &processing_barrier = block_barrier[processing_index % NUM_BLOCKS_PER_KERNEL_LOADED]; - auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; // ensure our data is ready - subset_barrier.arrive_and_wait(); + processing_barrier.arrive_and_wait(); - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + processing_index]; auto const rows_in_block = block.num_rows(); auto const cols_in_block = block.num_cols(); - auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes(cols_in_block, rows_in_block); - // auto shared_row_offsets = shared[subset]; - auto shared_col_sizes = reinterpret_cast(shared[subset % stages_count]); + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( + sizeof(decltype(*_col_sizes)), sizeof(decltype(*_col_offsets)), cols_in_block); + auto shared_col_sizes = reinterpret_cast(shared[processing_index % stages_count]); auto shared_col_offsets = - reinterpret_cast(&shared[subset % stages_count][col_size_bytes]); + reinterpret_cast(&shared[processing_index % stages_count][col_size_bytes]); auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); @@ -803,10 +761,10 @@ __global__ void copy_to_columns(const 
size_type num_rows, const size_type num_co shared_memory_row_offset + shared_row_offset; auto const column_size = shared_col_sizes[relative_col]; - int8_t *shmem_src = &shared[subset % stages_count][shared_memory_offset]; + int8_t *shmem_src = &shared[processing_index % stages_count][shared_memory_offset]; int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; - cuda::memcpy_async(dst, shmem_src, column_size, subset_barrier); + cuda::memcpy_async(dst, shmem_src, column_size, processing_barrier); } group.sync(); } @@ -831,7 +789,7 @@ __global__ void copy_to_columns(const size_type num_rows, const size_type num_co * @param input_data pointer to input data * */ -__global__ void copy_validity_to_columns( +__global__ void copy_validity_from_rows( const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, const size_type *row_offsets, cudf::bitmask_type **output_nm, const size_type validity_offset, const block_info *block_infos, const size_type num_block_infos, const int8_t *input_data) { @@ -1050,7 +1008,7 @@ fixed_width_convert_to_rows(const cudf::size_type start_row, const cudf::size_ty int shared_size = detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - copy_from_fixed_width_columns<<>>( + copy_to_rows_fixed_width_optimized<<>>( start_row, num_rows, num_columns, size_per_row, column_start.data(), column_size.data(), input_data.data(), input_nm.data(), data->mutable_view().data()); @@ -1354,18 +1312,6 @@ std::vector> convert_to_rows(cudf::table_view cons std::vector row_batches; - auto calculate_variable_width_row_data_size = [](int const row) { - // each level of variable-width data will add an offset/length - // uint64 of data. The first of which is inside the fixed-width - // data itself and needs to be aligned based on what is around - // that data. This is handled above with the fixed-width calculations - // for that reason. We may still need to add more of these offset/length - // combinations if the nesting is deeper than one level as these - // will be included in the variable-width data blob at the end of the - // row. 
- return 0; - }; - uint64_t row_batch_size = 0; uint64_t total_table_size = 0; size_type row_batch_rows = 0; @@ -1382,8 +1328,7 @@ std::vector> convert_to_rows(cudf::table_view cons // validity is byte aligned row_sizes[row] += validity_size; // variable width data is 8-byte aligned - row_sizes[row] = detail::align_offset(row_sizes[row], 8) + - calculate_variable_width_row_data_size(row); // rows are 8 byte aligned + row_sizes[row] = detail::align_offset(row_sizes[row], 8); // rows are 8 byte aligned if ((uint64_t)aligned_row_batch_size + row_sizes[row] > (uint64_t)std::numeric_limits::max()) { @@ -1426,10 +1371,10 @@ std::vector> convert_to_rows(cudf::table_view cons auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); // blast through the entire table and convert it - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS)); + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); dim3 threads(256); - detail::copy_from_columns<<>>( + detail::copy_to_rows<<>>( num_rows, num_columns, shmem_limit_per_block, block_infos.size(), dev_input_data.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), dev_row_offsets.data(), reinterpret_cast(dev_output_data.data())); @@ -1439,9 +1384,9 @@ std::vector> convert_to_rows(cudf::table_view cons auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); - detail::copy_validity_from_columns<<>>( num_rows, num_columns, shmem_limit_per_block, dev_row_offsets.data(), dev_output_data.data(), column_starts.back(), dev_validity_block_infos.data(), validity_block_infos.size(), @@ -1610,9 +1555,9 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); dim3 threads(std::min(std::min(256, shmem_limit_per_block / 8), (int)child.size())); - detail::copy_to_columns<<>>( + detail::copy_from_rows<<>>( num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), block_infos.size(), child.data()); @@ -1645,11 +1590,11 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in } auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); detail:: - copy_validity_to_columns<<>>( + copy_validity_from_rows<<>>( num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), dev_output_nm.data(), column_starts.back(), dev_validity_block_infos.data(), validity_block_infos.size(), child.data()); @@ -1707,7 +1652,7 @@ std::unique_ptr convert_from_rows_fixed_width_optimized( int shared_size = 
detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - detail::copy_to_fixed_width_columns<<>>( + detail::copy_from_rows_fixed_width_optimized<<>>( num_rows, num_columns, size_per_row, dev_column_start.data(), dev_column_size.data(), dev_output_data.data(), dev_output_nm.data(), child.data()); From 2c4e12fcc6f76e21cd1d6b0ca3f44ceb9ce251e4 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Thu, 21 Oct 2021 14:49:26 -0700 Subject: [PATCH 61/80] fixed typo --- java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java b/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java index 9541d05ce00..e4106574a19 100644 --- a/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java +++ b/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java @@ -393,7 +393,7 @@ public final void setInts(long offset, int[] data, long srcOffset, long len) { */ public final long getLong(long offset) { long requestedAddress = this.address + offset; - addressOutOfBoundsCheck(requestedAddress, 8, "setLong"); + addressOutOfBoundsCheck(requestedAddress, 8, "getLong"); return UnsafeMemoryAccessor.getLong(requestedAddress); } @@ -404,7 +404,7 @@ public final long getLong(long offset) { */ public final void setLong(long offset, long value) { long requestedAddress = this.address + offset; - addressOutOfBoundsCheck(requestedAddress, 8, "getLong"); + addressOutOfBoundsCheck(requestedAddress, 8, "setLong"); UnsafeMemoryAccessor.setLong(requestedAddress, value); } From fa4f0d3d7e9d8e6829a0a807dbc3eab053fff3d3 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Sat, 23 Oct 2021 01:21:15 +0000 Subject: [PATCH 62/80] Updating for actual PR. Fixed a few last minute bugs, removed cudf-land code that was there for testing and benchmarking. 
--- cpp/benchmarks/CMakeLists.txt | 35 - .../row_conversion/row_conversion.cpp | 181 -- cpp/src/row_conversion/row_conversion.cu | 1666 ----------------- cpp/tests/row_conversion/row_conversion.cpp | 677 ------- java/src/main/native/src/row_conversion.cu | 33 +- 5 files changed, 16 insertions(+), 2576 deletions(-) delete mode 100644 cpp/benchmarks/row_conversion/row_conversion.cpp delete mode 100644 cpp/src/row_conversion/row_conversion.cu delete mode 100644 cpp/tests/row_conversion/row_conversion.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 79783f0e512..fa1e61e26fd 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -110,21 +110,10 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask_ben # * stream_compaction benchmark ------------------------------------------------------------------- ConfigureBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates_benchmark.cpp) -<<<<<<< HEAD # ################################################################################################## # * join benchmark -------------------------------------------------------------------------------- ConfigureBench(JOIN_BENCH join/join_benchmark.cu join/conditional_join_benchmark.cu) ConfigureNVBench(JOIN_NVBENCH join/join_nvbench.cu) -======= -################################################################################################### -# - join benchmark -------------------------------------------------------------------------------- -<<<<<<< HEAD -ConfigureBench(JOIN_BENCH join/join_benchmark.cu) ->>>>>>> working on row and column conversions -======= -ConfigureBench(JOIN_BENCH join/join_benchmark.cu join/conditional_join_benchmark.cu) -ConfigureNVBench(JOIN_NVBENCH join/join_nvbench.cu) ->>>>>>> Fixing merge issue # ################################################################################################## # * iterator benchmark ---------------------------------------------------------------------------- @@ -215,7 +204,6 @@ ConfigureBench(CSV_WRITER_BENCH io/csv/csv_writer_benchmark.cpp) # * ast benchmark --------------------------------------------------------------------------------- ConfigureBench(AST_BENCH ast/transform_benchmark.cpp) -<<<<<<< HEAD # ################################################################################################## # * binaryop benchmark ---------------------------------------------------------------------------- ConfigureBench( @@ -227,18 +215,6 @@ ConfigureBench( # * nvtext benchmark ------------------------------------------------------------------- ConfigureBench( TEXT_BENCH -======= -################################################################################################### -# - binaryop benchmark ---------------------------------------------------------------------------- -ConfigureBench(BINARYOP_BENCH - binaryop/binaryop_benchmark.cpp - binaryop/compiled_binaryop_benchmark.cpp - binaryop/jit_binaryop_benchmark.cpp) - -################################################################################################### -# - nvtext benchmark ------------------------------------------------------------------- -ConfigureBench(TEXT_BENCH ->>>>>>> Fixing merge issue text/ngrams_benchmark.cpp text/normalize_benchmark.cpp text/normalize_spaces_benchmark.cpp @@ -272,7 +248,6 @@ ConfigureBench( string/url_decode_benchmark.cpp ) -<<<<<<< HEAD # ################################################################################################## # * json 
benchmark ------------------------------------------------------------------- ConfigureBench(JSON_BENCH string/json_benchmark.cpp) @@ -280,13 +255,3 @@ ConfigureBench(JSON_BENCH string/json_benchmark.cpp) # ################################################################################################## # * io benchmark --------------------------------------------------------------------- ConfigureBench(MULTIBYTE_SPLIT_BENCHMARK io/text/multibyte_split_benchmark.cpp) -======= -################################################################################################### -# - io benchmark --------------------------------------------------------------------- -ConfigureBench(MULTIBYTE_SPLIT_BENCHMARK - io/text/multibyte_split_benchmark.cpp) - -################################################################################################### -# - row conversion benchmark --------------------------------------------------------- -ConfigureBench(ROW_CONVERSION_BENCH row_conversion/row_conversion.cpp) ->>>>>>> working on row and column conversions diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp deleted file mode 100644 index fb8e4c8aef3..00000000000 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include - -#include -#include -#include - -class RowConversion : public cudf::benchmark { -}; - -static void BM_old_to_row(benchmark::State& state) -{ - cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, - cudf::type_id::INT32, - cudf::type_id::INT16, - cudf::type_id::INT64, - cudf::type_id::INT32, - cudf::type_id::BOOL8, - cudf::type_id::UINT16, - cudf::type_id::UINT8, - cudf::type_id::UINT64}, - 212, - row_count{n_rows}); - - cudf::size_type total_bytes = 0; - for (int i = 0; i < table->num_columns(); ++i) { - auto t = table->get_column(i).type(); - total_bytes += cudf::size_of(t); - } - - for (auto _ : state) { - cuda_event_timer raii(state, true, rmm::cuda_stream_default); - - auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view()); - } - - state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -static void BM_new_to_row(benchmark::State& state) -{ - cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, - cudf::type_id::INT32, - cudf::type_id::INT16, - cudf::type_id::INT64, - cudf::type_id::INT32, - cudf::type_id::BOOL8, - cudf::type_id::UINT16, - cudf::type_id::UINT8, - cudf::type_id::UINT64}, - 212, - row_count{n_rows}); - - cudf::size_type total_bytes = 0; - for (int i = 0; i < table->num_columns(); ++i) { - auto t = table->get_column(i).type(); - total_bytes += cudf::size_of(t); - } - - for (auto _ : state) { - cuda_event_timer raii(state, true, rmm::cuda_stream_default); - - auto new_rows = cudf::convert_to_rows(table->view()); - } - - state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -static void BM_old_from_row(benchmark::State& state) -{ - cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, - cudf::type_id::INT32, - cudf::type_id::INT16, - cudf::type_id::INT64, - cudf::type_id::INT32, - cudf::type_id::BOOL8, - cudf::type_id::UINT16, - cudf::type_id::UINT8, - cudf::type_id::UINT64}, - 256, - row_count{n_rows}); - - std::vector schema; - cudf::size_type total_bytes = 0; - for (int i = 0; i < table->num_columns(); ++i) { - auto t = table->get_column(i).type(); - schema.push_back(t); - total_bytes += cudf::size_of(t); - } - - auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view()); - cudf::lists_column_view const first_list(rows.front()->view()); - - for (auto _ : state) { - cuda_event_timer raii(state, true, rmm::cuda_stream_default); - - auto out = cudf::convert_from_rows_fixed_width_optimized(first_list, schema); - } - - state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -static void BM_new_from_row(benchmark::State& state) -{ - cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, - cudf::type_id::INT32, - cudf::type_id::INT16, - cudf::type_id::INT64, - cudf::type_id::INT32, - cudf::type_id::BOOL8, - cudf::type_id::UINT16, - cudf::type_id::UINT8, - cudf::type_id::UINT64}, - 256, - row_count{n_rows}); - - std::vector schema; - cudf::size_type total_bytes = 0; - for (int i = 0; i < table->num_columns(); ++i) { - auto t = table->get_column(i).type(); - schema.push_back(t); - total_bytes += cudf::size_of(t); - } - - auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view()); - cudf::lists_column_view const 
first_list(rows.front()->view()); - - for (auto _ : state) { - cuda_event_timer raii(state, true, rmm::cuda_stream_default); - - auto out = cudf::convert_from_rows(first_list, schema); - } - - state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) -TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) - -#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -FROM_ROW_CONVERSION_BENCHMARK_DEFINE(old_from_row_conversion, BM_old_from_row) -FROM_ROW_CONVERSION_BENCHMARK_DEFINE(new_from_row_conversion, BM_new_from_row) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu deleted file mode 100644 index c068a2c0b76..00000000000 --- a/cpp/src/row_conversion/row_conversion.cu +++ /dev/null @@ -1,1666 +0,0 @@ -/* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - #include - #include - #include - #include - #include - - #include - #include - - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - #include - #endif - - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 8; - constexpr auto NUM_BLOCKS_PER_KERNEL_TO_ROWS = 2; - constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; - constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; - constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; - #endif - - using cudf::detail::make_device_uvector_async; - using rmm::device_uvector; - namespace cudf { - - namespace detail { - - static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size_t alignment) { - return (offset + alignment - 1) & ~(alignment - 1); - } - - __global__ void copy_from_rows_fixed_width_optimized( - const cudf::size_type num_rows, const cudf::size_type num_columns, - const cudf::size_type row_size, const cudf::size_type *input_offset_in_row, - const cudf::size_type *num_bytes, int8_t **output_data, cudf::bitmask_type **output_nm, - const int8_t *input_data) { - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. 
- // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // For simplicity we will refer to this as a row_group - - // In practice we have found writing more than 4 columns of data per thread - // results in performance loss. As such we are using a 2 dimensional - // kernel in terms of threads, but not in terms of blocks. Columns are - // controlled by the y dimension (there is no y dimension in blocks). Rows - // are controlled by the x dimension (there are multiple blocks in the x - // dimension). - - cudf::size_type rows_per_group = blockDim.x; - cudf::size_type row_group_start = blockIdx.x; - cudf::size_type row_group_stride = gridDim.x; - cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; - - extern __shared__ int8_t shared_data[]; - - // Because we are copying fixed width only data and we stride the rows - // this thread will always start copying from shared data in the same place - int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t *row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; - - for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; - row_group_index += row_group_stride) { - // Step 1: Copy the data into shared memory - // We know row_size is always aligned with and a multiple of int64_t; - int64_t *long_shared = reinterpret_cast(shared_data); - const int64_t *long_input = reinterpret_cast(input_data); - - cudf::size_type shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); - cudf::size_type shared_output_stride = blockDim.x * blockDim.y; - cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); - if (row_index_end > num_rows) { - row_index_end = num_rows; - } - cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - cudf::size_type shared_length = row_size * num_rows_in_group; - - cudf::size_type shared_output_end = shared_length / sizeof(int64_t); - - cudf::size_type start_input_index = - (row_size * row_group_index * rows_per_group) / sizeof(int64_t); - - for (cudf::size_type shared_index = shared_output_index; shared_index < shared_output_end; - shared_index += shared_output_stride) { - long_shared[shared_index] = long_input[start_input_index + shared_index]; - } - // Wait for all of the data to be in shared memory - __syncthreads(); - - // Step 2 copy the data back out - - // Within the row group there should be 1 thread for each row. This is a - // requirement for launching the kernel - cudf::size_type row_index = (row_group_index * rows_per_group) + threadIdx.x; - // But we might not use all of the threads if the number of rows does not go - // evenly into the thread count. We don't want those threads to exit yet - // because we may need them to copy data in for the next row group. 
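This is why the kernel below computes an active_mask before any row-bound branch: every lane has to join the ballot so a full 32-bit validity word can be assembled even when the last row group only partially fills the warp. A stand-alone toy version of that pattern (illustrative names; assumes blockDim.x is a multiple of 32):

#include <cuda_runtime.h>
#include <cstdint>

__global__ void gather_validity(const bool *valid_in, std::uint32_t *valid_words, int num_rows) {
  int const row = blockIdx.x * blockDim.x + threadIdx.x;

  // all 32 lanes vote, including those whose row index is past the end of the data
  std::uint32_t const active_mask = __ballot_sync(0xffffffff, row < num_rows);
  if (row < num_rows) {
    // each in-bounds lane contributes one validity bit for its row
    std::uint32_t const bits = __ballot_sync(active_mask, valid_in[row]);
    // one lane per 32-row group writes the assembled word
    if (row % 32 == 0) valid_words[row / 32] = bits;
  }
}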
- uint32_t active_mask = __ballot_sync(0xffffffff, row_index < num_rows); - if (row_index < num_rows) { - cudf::size_type col_index_start = threadIdx.y; - cudf::size_type col_index_stride = blockDim.y; - for (cudf::size_type col_index = col_index_start; col_index < num_columns; - col_index += col_index_stride) { - cudf::size_type col_size = num_bytes[col_index]; - const int8_t *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); - int8_t *col_output = output_data[col_index]; - switch (col_size) { - case 1: { - col_output[row_index] = *col_tmp; - break; - } - case 2: { - int16_t *short_col_output = reinterpret_cast(col_output); - short_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - case 4: { - int32_t *int_col_output = reinterpret_cast(col_output); - int_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - case 8: { - int64_t *long_col_output = reinterpret_cast(col_output); - long_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - default: { - cudf::size_type output_offset = col_size * row_index; - // TODO this should just not be supported for fixed width columns, but just in case... - for (cudf::size_type b = 0; b < col_size; b++) { - col_output[b + output_offset] = col_tmp[b]; - } - break; - } - } - - cudf::bitmask_type *nm = output_nm[col_index]; - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; - int predicate = *valid_byte & (1 << byte_bit_offset); - uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { - nm[word_index(row_index)] = bitmask; - } - } // end column loop - } // end row copy - // wait for the row_group to be totally copied before starting on the next row group - __syncthreads(); - } - } - - __global__ void copy_to_rows_fixed_width_optimized( - const cudf::size_type start_row, const cudf::size_type num_rows, - const cudf::size_type num_columns, const cudf::size_type row_size, - const cudf::size_type *output_offset_in_row, const cudf::size_type *num_bytes, - const int8_t **input_data, const cudf::bitmask_type **input_nm, int8_t *output_data) { - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // We do not support copying a subset of the columns in a row yet, so we don't - // currently support a row that is wider than shared memory. - // For simplicity we will refer to this as a row_group - - // In practice we have found reading more than 4 columns of data per thread - // results in performance loss. As such we are using a 2 dimensional - // kernel in terms of threads, but not in terms of blocks. Columns are - // controlled by the y dimension (there is no y dimension in blocks). Rows - // are controlled by the x dimension (there are multiple blocks in the x - // dimension). 
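// Annotation (not part of the original file): a "row_group" is simply the blockDim.x
// rows that one block stages in shared memory at a time, so the dynamic shared memory a
// block needs is row_size * blockDim.x bytes; calc_fixed_width_kernel_dims() further
// down sizes the launch so that exactly this amount fits within the default 48 KB budget.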
- - cudf::size_type rows_per_group = blockDim.x; - cudf::size_type row_group_start = blockIdx.x; - cudf::size_type row_group_stride = gridDim.x; - cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; - - extern __shared__ int8_t shared_data[]; - - // Because we are copying fixed width only data and we stride the rows - // this thread will always start copying to shared data in the same place - int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t *row_vld_tmp = - &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; - - for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; - row_group_index += row_group_stride) { - // Within the row group there should be 1 thread for each row. This is a - // requirement for launching the kernel - cudf::size_type row_index = start_row + (row_group_index * rows_per_group) + threadIdx.x; - // But we might not use all of the threads if the number of rows does not go - // evenly into the thread count. We don't want those threads to exit yet - // because we may need them to copy data back out. - if (row_index < (start_row + num_rows)) { - cudf::size_type col_index_start = threadIdx.y; - cudf::size_type col_index_stride = blockDim.y; - for (cudf::size_type col_index = col_index_start; col_index < num_columns; - col_index += col_index_stride) { - cudf::size_type col_size = num_bytes[col_index]; - int8_t *col_tmp = &(row_tmp[output_offset_in_row[col_index]]); - const int8_t *col_input = input_data[col_index]; - switch (col_size) { - case 1: { - *col_tmp = col_input[row_index]; - break; - } - case 2: { - const int16_t *short_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = short_col_input[row_index]; - break; - } - case 4: { - const int32_t *int_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = int_col_input[row_index]; - break; - } - case 8: { - const int64_t *long_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = long_col_input[row_index]; - break; - } - default: { - cudf::size_type input_offset = col_size * row_index; - // TODO this should just not be supported for fixed width columns, but just in case... 
- for (cudf::size_type b = 0; b < col_size; b++) { - col_tmp[b] = col_input[b + input_offset]; - } - break; - } - } - // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned - // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; - uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; - int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); - cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); - // Now copy validity for the column - if (input_nm[col_index]) { - if (bit_is_set(input_nm[col_index], row_index)) { - atomicOr_block(valid_int, 1 << int_bit_offset); - } else { - atomicAnd_block(valid_int, ~(1 << int_bit_offset)); - } - } else { - // It is valid so just set the bit - atomicOr_block(valid_int, 1 << int_bit_offset); - } - } // end column loop - } // end row copy - // wait for the row_group to be totally copied into shared memory - __syncthreads(); - - // Step 2: Copy the data back out - // We know row_size is always aligned with and a multiple of int64_t; - int64_t *long_shared = reinterpret_cast(shared_data); - int64_t *long_output = reinterpret_cast(output_data); - - cudf::size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); - cudf::size_type shared_input_stride = blockDim.x * blockDim.y; - cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); - if (row_index_end > num_rows) { - row_index_end = num_rows; - } - cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - cudf::size_type shared_length = row_size * num_rows_in_group; - - cudf::size_type shared_input_end = shared_length / sizeof(int64_t); - - cudf::size_type start_output_index = - (row_size * row_group_index * rows_per_group) / sizeof(int64_t); - - for (cudf::size_type shared_index = shared_input_index; shared_index < shared_input_end; - shared_index += shared_input_stride) { - long_output[start_output_index + shared_index] = long_shared[shared_index]; - } - __syncthreads(); - // Go for the next round - } - } - - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - - struct block_info { - int start_col; - int start_row; - int end_col; - int end_row; - int buffer_num; - - __host__ __device__ size_type get_shared_row_size(size_type const *const col_offsets, - size_type const *const col_sizes) const { - return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); - } - __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } - - __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } - }; - - // When building the columns to return, we have to be mindful of the offset limit in cudf. - // It is 32-bit and these data columns are capable of surpassing that easily. The data should - // not be cut off exactly at the limit though due to the validity buffers. The most efficient - // place to cut the validity is on a 32-row boundary, so as we calculate the row sizes - // we keep track of the cut points for the validity, which we call row batches. If the row - // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we - // hit. Note that this boundary is for our book-keeping with column pointers and not anything that - // the kernel needs to worry about. We cut the output at convienient boundaries when assembling - // the outgoing data stream. 
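// Illustrative sketch of the batching rule described above (names and structure here are
// assumptions for this annotation, simplified from the in-flight bookkeeping done in
// convert_to_rows below): accumulate rows until the next row would push the batch past
// the 32-bit offset limit, then close the batch at the previous multiple-of-32 row so a
// validity word never straddles two output columns. Assumes at least 32 rows fit in a batch.
#include <cstdint>
#include <limits>
#include <numeric>
#include <vector>

struct row_batch_sketch {
  int64_t num_bytes;  // bytes of row data owned by this batch
  int row_count;      // rows in this batch (a multiple of 32 for all but the last batch)
};

std::vector<row_batch_sketch> split_into_batches(std::vector<int64_t> const& row_sizes)
{
  std::vector<row_batch_sketch> batches;
  std::vector<int64_t> pending;  // sizes of the rows in the batch being built
  int64_t pending_bytes = 0;
  for (auto size : row_sizes) {
    if (pending_bytes + size > std::numeric_limits<int32_t>::max()) {
      // close the batch at the last multiple-of-32 row boundary and carry the rest over
      auto const keep = static_cast<int>(pending.size()) & ~31;
      auto const kept_bytes =
        std::accumulate(pending.begin(), pending.begin() + keep, int64_t{0});
      batches.push_back({kept_bytes, keep});
      pending.erase(pending.begin(), pending.begin() + keep);
      pending_bytes -= kept_bytes;
    }
    pending.push_back(size);
    pending_bytes += size;
  }
  if (!pending.empty()) {
    batches.push_back({pending_bytes, static_cast<int>(pending.size())});
  }
  return batches;
}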
- struct row_batch { - size_type num_bytes; - size_type row_count; - }; - - /** - * @brief copy data from cudf columns into x format, which is row-based - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param input_data pointer to raw table data - * @param input_nm pointer to validity data - * @param col_sizes array of sizes for each element in a column - one per column - * @param col_offsets offset into input data row for each column's start - * @param block_infos information about the blocks of work - * @param row_offsets offset to a specific row in the input data - * @param output_data pointer to output data - * - */ - __global__ void copy_to_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_block, const size_type num_block_infos, - const int8_t **input_data, const size_type *col_sizes, - const size_type *col_offsets, const block_info *block_infos, - const size_type *row_offsets, int8_t **output_data) { - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // This has been broken up for us in the block_info struct, so we don't have - // any calculation to do here, but it is important to note. - - constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; - auto group = cooperative_groups::this_thread_block(); - extern __shared__ int8_t shared_data[]; - int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; - - __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&block_barrier[i], group.size()); - } - } - - group.sync(); - - auto const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS, - (uint)NUM_BLOCKS_PER_KERNEL_TO_ROWS); - - size_t fetch; - size_t subset; - for (subset = fetch = 0; subset < blocks_remaining; ++subset) { - // Fetch ahead up to stages_count subsets - for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { - auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + fetch]; - auto const num_fetch_cols = fetch_block.num_cols(); - auto const num_fetch_rows = fetch_block.num_rows(); - auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; - auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); - auto const starting_column_offset = col_offsets[fetch_block.start_col]; - auto &fetch_barrier = block_barrier[fetch % NUM_BLOCKS_PER_KERNEL_LOADED]; - - // wait for the last use of the memory to be completed - if (fetch > NUM_BLOCKS_PER_KERNEL_LOADED) { - fetch_barrier.arrive_and_wait(); - } - - // to do the copy we need to do n column copies followed by m element copies OR - // we have to do m element copies followed by r row copies. When going from column - // to row it is much easier to copy by elements first otherwise we would need a running - // total of the column sizes for our block, which isn't readily available. This makes it - // more appealing to copy element-wise from input data into shared matching the end layout - // and do row-based memcopies out. 
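// Annotation (not part of the original file): concretely, an element at
// (relative_row, relative_col) within the block is staged at
//   shared_offset = relative_row * fetch_block_row_size
//                 + (col_offsets[absolute_col] - col_offsets[fetch_block.start_col]);
// which is the byte position it will occupy in the output row, so the row-wise
// memcpy_async that follows can move whole rows with no per-column fix-up.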
- - auto const shared_buffer_base = shared[fetch % stages_count]; - for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { - auto const relative_col = el / num_fetch_rows; - auto const relative_row = el % num_fetch_rows; - auto const absolute_col = relative_col + fetch_block.start_col; - auto const absolute_row = relative_row + fetch_block.start_row; - auto const col_size = col_sizes[absolute_col]; - auto const col_offset = col_offsets[absolute_col]; - auto const relative_col_offset = col_offset - starting_column_offset; - - auto const shared_offset = relative_row * fetch_block_row_size + relative_col_offset; - auto const input_src = input_data[absolute_col] + col_size * absolute_row; - - // copy the element from global memory - switch (col_size) { - case 2: - cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, - cuda::aligned_size_t<2>(col_size), fetch_barrier); - break; - case 4: - cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, - cuda::aligned_size_t<4>(col_size), fetch_barrier); - break; - case 8: - cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, - cuda::aligned_size_t<8>(col_size), fetch_barrier); - break; - default: - cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, col_size, - fetch_barrier); - break; - } - } - } - - auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; - subset_barrier.arrive_and_wait(); - - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + subset]; - auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); - auto const column_offset = col_offsets[block.start_col]; - auto const block_output_buffer = output_data[block.buffer_num]; - - // copy entire rows to final dest - for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; - absolute_row += blockDim.x) { - auto const relative_row = absolute_row - block.start_row; - auto const output_dest = block_output_buffer + row_offsets[absolute_row] + column_offset; - auto const shared_offset = block_row_size * relative_row; - - cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], - cuda::aligned_size_t<8>(block_row_size), subset_barrier); - } - } - - // wait on the last copies to complete - for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { - block_barrier[i].arrive_and_wait(); - } - } - - /** - * @brief copy data from row-based format to cudf columns - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets - * @param output_data pointer to output data, partitioned by data size - * @param validity_offsets offset into input data row for validity data - * @param block_infos information about the blocks of work - * @param num_block_infos number of infos in blocks array - * @param input_data pointer to input data - * - */ - __global__ void copy_validity_to_rows( - const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, - const size_type *row_offsets, int8_t **output_data, const size_type validity_offset, - const block_info *block_infos, const size_type num_block_infos, const bitmask_type **input_nm) { - extern __shared__ int8_t shared_data[]; - int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { - shared_data, shared_data + shmem_used_per_block / 2}; - - // per conversation with DaveB - // each 
thread of warp reads a single int32 of validity - so we read 128 bytes - // then ballot_sync the bits and write the result to shmem - // after we fill shared mem memcpy it out in a blob. - // probably need knobs for number of rows vs columns to balance read/write - auto group = cooperative_groups::this_thread_block(); - - int const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, - (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); - - __shared__ cuda::barrier - shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&shared_block_barriers[i], group.size()); - } - } - - group.sync(); - - for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { - if (validity_block != validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] - .arrive_and_wait(); - } - int8_t *this_shared_block = shared_blocks[validity_block % 2]; - auto block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; - - auto const num_block_cols = block.num_cols(); - auto const num_block_rows = block.num_rows(); - - auto const num_sections_x = util::div_rounding_up_unsafe(num_block_cols, 32); - auto const num_sections_y = util::div_rounding_up_unsafe(num_block_rows, 32); - auto const validity_data_row_length = - align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); - auto const total_sections = num_sections_x * num_sections_y; - - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); - - // the block is divided into sections. A warp operates on a section at a time. - for (int my_section_idx = warp_id; my_section_idx < total_sections; - my_section_idx += warps_per_block) { - // convert to rows and cols - auto const section_x = my_section_idx % num_sections_x; - auto const section_y = my_section_idx / num_sections_x; - auto const relative_col = section_x * 32 + lane_id; - auto const relative_row = section_y * 32; - auto const absolute_col = relative_col + block.start_col; - auto const absolute_row = relative_row + block.start_row; - auto const cols_left = num_columns - absolute_col; - auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); - - if (absolute_col < num_columns) { - auto my_data = input_nm[absolute_col] != nullptr ? - input_nm[absolute_col][absolute_row / 32] : - std::numeric_limits::max(); - - // every thread that is participating in the warp has a byte, but it's column-based - // data and we need it in row-based. So we shuffle the bits around with ballot_sync to - // make the bytes we actually write. 
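// Illustrative scalar sketch (not part of the original file; the kernel below does this
// warp-wide with __ballot_sync, and the names here are assumptions): each column
// contributes one bit per row out of its 32-row bitmask word, and those bits are
// gathered into a row-major word whose bit c corresponds to column (start_col + c).
#include <cstdint>

uint32_t gather_row_validity(uint32_t const* const* col_bitmasks,  // [column][row / 32]
                             int start_col,
                             int num_columns,
                             int row)
{
  uint32_t row_word = 0;
  for (int c = 0; c < 32 && start_col + c < num_columns; ++c) {
    uint32_t const col_word = col_bitmasks[start_col + c] != nullptr
                                ? col_bitmasks[start_col + c][row / 32]
                                : 0xFFFFFFFFu;  // a null mask pointer means all rows valid
    if ((col_word >> (row % 32)) & 1u) { row_word |= 1u << c; }
  }
  return row_word;  // validity bits for up to 32 consecutive columns of this row
}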
- bitmask_type dw_mask = 1; - for (int i = 0; i < 32 && relative_row + i < num_rows; ++i, dw_mask <<= 1) { - auto validity_data = __ballot_sync(participation_mask, my_data & dw_mask); - // lead thread in each warp writes data - auto const validity_write_offset = - validity_data_row_length * (relative_row + i) + relative_col / 8; - if (threadIdx.x % detail::warp_size == 0) { - if (cols_left <= 8) { - // write byte - this_shared_block[validity_write_offset] = validity_data & 0xFF; - } else if (cols_left <= 16) { - // write int16 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - } else if (cols_left <= 24) { - // write int16 and then int8 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; - } else { - // write int32 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data; - } - } - } - } - } - - // make sure entire block has finished copy - group.sync(); - - auto const output_data_base = - output_data[block.buffer_num] + validity_offset + block.start_col / 8; - - // now async memcpy the shared memory out to the final destination - for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { - auto const relative_row = row - block.start_row; - auto const output_ptr = output_data_base + row_offsets[row]; - auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); - - cuda::memcpy_async( - output_ptr, &this_shared_block[validity_data_row_length * relative_row], num_bytes, - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); - } - } - - // wait for last blocks of data to arrive - for (int validity_block = 0; - validity_block < blocks_remaining % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - ++validity_block) { - shared_block_barriers[validity_block].arrive_and_wait(); - } - } - - static __device__ std::tuple - get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num_cols) { - auto const col_size_bytes = num_cols * col_size_size; - auto const col_offset_bytes = num_cols * col_offset_size; - - return {col_size_bytes, col_offset_bytes}; - } - - /** - * @brief copy data from row-based format to cudf columns - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block - * @param row_offsets - * @param output_data - * @param output_nm - * @param col_sizes array of sizes for each element in a column - one per column - * @param col_offsets offset into input data row for each column's start - * @param block_infos information about the blocks of work - * @param input_data pointer to input data - * - */ - __global__ void copy_from_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_block, const size_type *row_offsets, - int8_t **output_data, const size_type *_col_sizes, - const size_type *_col_offsets, const block_info *block_infos, - const size_type num_block_infos, const int8_t *input_data) { - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. 
- // This has been broken up for us in the block_info struct, so we don't have - // any calculation to do here, but it is important to note. - - // to speed up some of the random access memory we do, we copy col_sizes and col_offsets - // to shared memory for each of the blocks that we work on - - constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; - auto group = cooperative_groups::this_thread_block(); - extern __shared__ int8_t shared_data[]; - int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; - - __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&block_barrier[i], group.size()); - } - } - - group.sync(); - - auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS, - (uint)NUM_BLOCKS_PER_KERNEL_FROM_ROWS); - - size_t fetch_index; - size_t processing_index; - for (processing_index = fetch_index = 0; processing_index < blocks_remaining; - ++processing_index) { - // Fetch ahead up to stages_count groups - for (; fetch_index < static_cast(blocks_remaining) && - fetch_index < (processing_index + stages_count); - ++fetch_index) { - auto const fetch_block = - block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + fetch_index]; - auto const fetch_block_start_row = fetch_block.start_row; - auto const fetch_block_end_row = fetch_block.end_row; - auto const starting_col_offset = _col_offsets[fetch_block.start_col]; - auto const fetch_block_row_size = fetch_block.get_shared_row_size(_col_offsets, _col_sizes); - auto const num_fetch_cols = fetch_block.num_cols(); - auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( - sizeof(decltype(*_col_sizes)), sizeof(decltype(*_col_offsets)), num_fetch_cols); - auto &fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; - - // if we have fetched all buffers, we need to wait for processing - // to complete on them before we can use them again - if (fetch_index > NUM_BLOCKS_PER_KERNEL_LOADED) { - fetch_barrier.arrive_and_wait(); - } - - auto shared_row_offset = 0; - // copy the data for column sizes - cuda::memcpy_async(group, &shared[fetch_index % stages_count][shared_row_offset], - &_col_sizes[fetch_block.start_col], col_size_bytes, fetch_barrier); - shared_row_offset += col_size_bytes; - // copy the data for column offsets - cuda::memcpy_async(group, &shared[fetch_index % stages_count][shared_row_offset], - &_col_offsets[fetch_block.start_col], col_offset_bytes, fetch_barrier); - shared_row_offset += col_offset_bytes; - shared_row_offset = align_offset(shared_row_offset, 8); - - for (auto row = fetch_block_start_row + static_cast(threadIdx.x); - row <= fetch_block_end_row; row += blockDim.x) { - auto shared_offset = - (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; - // copy the main - cuda::memcpy_async(&shared[fetch_index % stages_count][shared_offset], - &input_data[row_offsets[row] + starting_col_offset], - fetch_block_row_size, fetch_barrier); - } - } - - auto &processing_barrier = block_barrier[processing_index % NUM_BLOCKS_PER_KERNEL_LOADED]; - - // ensure our data is ready - processing_barrier.arrive_and_wait(); - - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + processing_index]; - auto const rows_in_block = block.num_rows(); - auto const cols_in_block = block.num_cols(); - - auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( - 
sizeof(decltype(*_col_sizes)), sizeof(decltype(*_col_offsets)), cols_in_block); - auto shared_col_sizes = reinterpret_cast(shared[processing_index % stages_count]); - auto shared_col_offsets = - reinterpret_cast(&shared[processing_index % stages_count][col_size_bytes]); - - auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); - - auto block_row_size = block.get_shared_row_size(_col_offsets, _col_sizes); - - // now we copy from shared memory to final destination. - // the data is laid out in rows in shared memory, so the reads - // for a column will be "vertical". Because of this and the different - // sizes for each column, this portion is handled on row/column basis. - // to prevent each thread working on a single row and also to ensure - // that all threads can do work in the case of more threads than rows, - // we do a global index instead of a double for loop with col/row. - for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { - auto const relative_col = index % cols_in_block; - auto const relative_row = index / cols_in_block; - auto const absolute_col = relative_col + block.start_col; - auto const absolute_row = relative_row + block.start_row; - - auto const shared_memory_row_offset = block_row_size * relative_row; - auto const shared_memory_offset = shared_col_offsets[relative_col] - shared_col_offsets[0] + - shared_memory_row_offset + shared_row_offset; - auto const column_size = shared_col_sizes[relative_col]; - - int8_t *shmem_src = &shared[processing_index % stages_count][shared_memory_offset]; - int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; - - cuda::memcpy_async(dst, shmem_src, column_size, processing_barrier); - } - group.sync(); - } - - // wait on the last copies to complete - for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { - block_barrier[i].arrive_and_wait(); - } - } - - /** - * @brief copy data from row-based format to cudf columns - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets - * @param output_nm - * @param validity_offsets offset into input data row for validity data - * @param block_infos information about the blocks of work - * @param num_block_infos number of infos in blocks array - * @param input_data pointer to input data - * - */ - __global__ void copy_validity_from_rows( - const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, - const size_type *row_offsets, cudf::bitmask_type **output_nm, const size_type validity_offset, - const block_info *block_infos, const size_type num_block_infos, const int8_t *input_data) { - extern __shared__ int8_t shared_data[]; - int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { - shared_data, shared_data + shmem_used_per_block / 2}; - - // per conversation with DaveB - // each thread of warp reads a single byte of validity - so we read 32 bytes - // then ballot_sync the bits and write the result to shmem - // after we fill shared mem memcpy it out in a blob. 
- // probably need knobs for number of rows vs columns to balance read/write - auto group = cooperative_groups::this_thread_block(); - - int const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, - (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); - - __shared__ cuda::barrier - shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&shared_block_barriers[i], group.size()); - } - } - - group.sync(); - - for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { - auto const validity_index = validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - if (validity_block != validity_index) { - shared_block_barriers[validity_index].arrive_and_wait(); - } - int8_t *this_shared_block = shared_blocks[validity_block % 2]; - auto const block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; - auto const block_start_col = block.start_col; - auto const block_start_row = block.start_row; - auto const num_block_cols = block.num_cols(); - auto const num_block_rows = block.num_rows(); - auto const num_sections_x = (num_block_cols + 7) / 8; - auto const num_sections_y = (num_block_rows + 31) / 32; - auto const validity_data_col_length = num_sections_y * 4; // words to bytes - auto const total_sections = num_sections_x * num_sections_y; - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); - - // the block is divided into sections. A warp operates on a section at a time. - for (int my_section_idx = warp_id; my_section_idx < total_sections; - my_section_idx += warps_per_block) { - // convert to rows and cols - auto const section_x = my_section_idx % num_sections_x; - auto const section_y = my_section_idx / num_sections_x; - auto const relative_col = section_x * 8; - auto const relative_row = section_y * 32 + lane_id; - auto const absolute_col = relative_col + block_start_col; - auto const absolute_row = relative_row + block_start_row; - auto const rows_left = num_rows - absolute_row; - - auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_row < num_rows); - - if (absolute_row < num_rows) { - auto const my_byte = - input_data[row_offsets[absolute_row] + validity_offset + absolute_col / 8]; - - // so every thread that is participating in the warp has a byte, but it's row-based - // data and we need it in column-based. So we shiffle the bits around to make - // the bytes we actually write. 
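// Annotation (not part of the original file): this kernel is the mirror image of
// copy_validity_to_rows above. Each lane of a warp holds one byte of one row's validity,
// and the ballot across the 32 rows of a section yields a 32-bit word already in cudf's
// column bitmask layout for column (relative_col + i); it is staged in shared memory at
// validity_data_col_length * (relative_col + i) + relative_row / 8 and later copied into
// output_nm[col] with memcpy_async.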
- for (int i = 0, byte_mask = 1; i < 8 && relative_col + i < num_columns; - ++i, byte_mask <<= 1) { - auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); - // lead thread in each warp writes data - if (threadIdx.x % detail::warp_size == 0) { - auto const validity_write_offset = - validity_data_col_length * (relative_col + i) + relative_row / 8; - - if (rows_left <= 8) { - // write byte - this_shared_block[validity_write_offset] = validity_data & 0xFF; - } else if (rows_left <= 16) { - // write int16 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - } else if (rows_left <= 24) { - // write int16 and then int8 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; - } else { - // write int32 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data; - } - } - } - } - } - - // make sure entire block has finished copy - group.sync(); - - // now async memcpy the shared - for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { - auto const relative_col = col - block.start_col; - - cuda::memcpy_async( - output_nm[col] + word_index(block_start_row), - &this_shared_block[validity_data_col_length * relative_col], - util::div_rounding_up_unsafe(num_block_rows, 8), - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); - } - } - - // wait for last blocks of data to arrive - auto const num_blocks_to_wait = blocks_remaining > NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED ? - NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED : - blocks_remaining; - for (int validity_block = 0; validity_block < num_blocks_to_wait; ++validity_block) { - shared_block_barriers[validity_block].arrive_and_wait(); - } - } - - #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - - /** - * Calculate the dimensions of the kernel for fixed width only columns. - * @param [in] num_columns the number of columns being copied. - * @param [in] num_rows the number of rows being copied. - * @param [in] size_per_row the size each row takes up when padded. - * @param [out] blocks the size of the blocks for the kernel - * @param [out] threads the size of the threads for the kernel - * @return the size in bytes of shared memory needed for each block. - */ - static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, - const cudf::size_type num_rows, - const cudf::size_type size_per_row, dim3 &blocks, - dim3 &threads) { - // We have found speed degrades when a thread handles more than 4 columns. - // Each block is 2 dimensional. The y dimension indicates the columns. - // We limit this to 32 threads in the y dimension so we can still - // have at least 32 threads in the x dimension (1 warp) which should - // result in better coalescing of memory operations. We also - // want to guarantee that we are processing a multiple of 32 threads - // in the x dimension because we use atomic operations at the block - // level when writing validity data out to main memory, and that would - // need to change if we split a word of validity data between blocks. 
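// Worked instance of the arithmetic below, with assumed (illustrative) inputs of
// 10 fixed-width columns and a 32-byte padded row; the constants mirror the code.
#include <algorithm>

constexpr int num_columns_ex  = 10;
constexpr int size_per_row_ex = 32;
constexpr int y_block_size_ex = std::min((num_columns_ex + 3) / 4, 32);                // 3
constexpr int x_possible_ex   = 1024 / y_block_size_ex;                                // 341
constexpr int max_block_ex    = std::min(48 * 1024 / size_per_row_ex, x_possible_ex);  // 341
constexpr int block_size_ex   = (max_block_ex / 32) * 32;                              // 320
constexpr int shmem_bytes_ex  = size_per_row_ex * block_size_ex;                       // 10240
static_assert(block_size_ex == 320 && shmem_bytes_ex == 10240, "worked example");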
-  int y_block_size = (num_columns + 3) / 4;  // cudf::util::div_rounding_up_safe(num_columns, 4);
-  if (y_block_size > 32) {
-    y_block_size = 32;
-  }
-  int x_possible_block_size = 1024 / y_block_size;
-  // 48KB is the default setting for shared memory per block according to the cuda tutorials
-  // If someone configures the GPU to only have 16 KB this might not work.
-  int max_shared_size = 48 * 1024;
-  int max_block_size = max_shared_size / size_per_row;
-  // If we don't have enough shared memory there is no point in having more threads
-  // per block that will just sit idle
-  max_block_size = max_block_size > x_possible_block_size ? x_possible_block_size : max_block_size;
-  // Make sure that the x dimension is a multiple of 32; this not only helps
-  // coalesce memory access, it also lets us do a ballot sync for validity to write
-  // the data back out at the warp level. If x is a multiple of 32 then each thread in the y
-  // dimension is associated with one or more warps, which should correspond to the validity
-  // words directly.
-  int block_size = (max_block_size / 32) * 32;
-  CUDF_EXPECTS(block_size != 0, "Row size is too large to fit in shared memory");
-
-  int num_blocks = (num_rows + block_size - 1) / block_size;
-  if (num_blocks < 1) {
-    num_blocks = 1;
-  } else if (num_blocks > 10240) {
-    // The maximum number of blocks supported in the x dimension is 2 ^ 31 - 1
-    // but in practice having too many can cause some overhead that I don't totally
-    // understand. Playing around with this, having as little as 600 blocks appears
-    // to be able to saturate memory on V100, so this is an order of magnitude higher
-    // to try and future-proof this a bit.
-    num_blocks = 10240;
-  }
-  blocks.x = num_blocks;
-  blocks.y = 1;
-  blocks.z = 1;
-  threads.x = block_size;
-  threads.y = y_block_size;
-  threads.z = 1;
-  return size_per_row * block_size;
-}
-
-/**
- * When converting to rows it is possible that the size of the table was too big to fit
- * in a single column. This creates an output column for a subset of the rows in a table
- * going from start row and containing the next num_rows. Most of the parameters passed
- * into this function are common between runs and should be calculated once.
- */ - static std::unique_ptr - fixed_width_convert_to_rows(const cudf::size_type start_row, const cudf::size_type num_rows, - const cudf::size_type num_columns, const cudf::size_type size_per_row, - rmm::device_uvector &column_start, - rmm::device_uvector &column_size, - rmm::device_uvector &input_data, - rmm::device_uvector &input_nm, - const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, - rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - int64_t total_allocation = size_per_row * num_rows; - // We made a mistake in the split somehow - CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); - - // Allocate and set the offsets row for the byte array - std::unique_ptr offsets = - cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream); - - std::unique_ptr data = cudf::make_numeric_column( - cudf::data_type(cudf::type_id::INT8), static_cast(total_allocation), - cudf::mask_state::UNALLOCATED, stream, mr); - - dim3 blocks; - dim3 threads; - int shared_size = - detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - - copy_to_rows_fixed_width_optimized<<>>( - start_row, num_rows, num_columns, size_per_row, column_start.data(), column_size.data(), - input_data.data(), input_nm.data(), data->mutable_view().data()); - - return cudf::make_lists_column(num_rows, std::move(offsets), std::move(data), 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr); - } - - static cudf::data_type get_data_type(const cudf::column_view &v) { - return v.type(); - } - - static inline bool are_all_fixed_width(std::vector const &schema) { - return std::all_of(schema.begin(), schema.end(), - [](const cudf::data_type &t) { return cudf::is_fixed_width(t); }); - } - - /** - * Given a set of fixed width columns, calculate how the data will be laid out in memory. - * @param [in] schema the types of columns that need to be laid out. - * @param [out] column_start the byte offset where each column starts in the row. - * @param [out] column_size the size in bytes of the data for each columns in the row. - * @return the size in bytes each row needs. - */ - static inline int32_t compute_fixed_width_layout(std::vector const &schema, - std::vector &column_start, - std::vector &column_size) { - // We guarantee that the start of each column is 64-bit aligned so anything can go - // there, but to make the code simple we will still do an alignment for it. - int32_t at_offset = 0; - for (auto col = schema.begin(); col < schema.end(); col++) { - cudf::size_type s = cudf::size_of(*col); - column_size.emplace_back(s); - std::size_t allocation_needed = s; - std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types - at_offset = align_offset(at_offset, alignment_needed); - column_start.emplace_back(at_offset); - at_offset += allocation_needed; - } - - // Now we need to add in space for validity - // Eventually we can think about nullable vs not nullable, but for now we will just always add - // it in - int32_t validity_bytes_needed = - (schema.size() + 7) / 8; // cudf::util::div_rounding_up_safe(schema.size(), 8); - // validity comes at the end and is byte aligned so we can pack more in. 
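// Worked instance (illustrative values, not from the original): a schema of
// {INT8, INT32, INT64} yields column_start = {0, 4, 8} and column_size = {1, 4, 8},
// so at_offset is 16 when the loop ends; the single validity byte for 3 columns brings
// it to 17, and the final align_offset(17, 8) below returns a padded row size of 24.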
- at_offset += validity_bytes_needed; - // Now we need to pad the end so all rows are 64 bit aligned - return align_offset(at_offset, 8); // 8 bytes (64 bits) - } - - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - - template - static size_type compute_column_information(iterator begin, iterator end, - std::vector &column_starts, - std::vector &column_sizes) //, - // std::function nested_type_cb) - { - size_type fixed_width_size_per_row = 0; - for (auto cv = begin; cv != end; ++cv) { - auto col_type = std::get<0>(*cv); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - // if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } - - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 8 : size_of(col_type); - - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - } - - auto validity_offset = fixed_width_size_per_row; - column_starts.push_back(validity_offset); - - return fixed_width_size_per_row; - } - - std::vector - build_validity_block_infos(size_type const &num_columns, size_type const &num_rows, - size_type const &shmem_limit_per_block, - std::vector const &row_batches) { - auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); - auto const column_stride = align_offset( - [&]() { - if (desired_rows_and_columns > num_columns) { - // not many columns, group it into 8s and ship it off - return std::min(8, num_columns); - } else { - return util::round_down_safe(desired_rows_and_columns, 8); - } - }(), - 8); - // we fit as much as we can given the column stride - // note that an element in the table takes just 1 bit, but a row with a single - // element still takes 8 bytes! 
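// Worked instance (assumed values, not from the original): with a 24 KiB
// shmem_limit_per_block, desired_rows_and_columns = (int)sqrt(24576) = 156; for a table
// of 300 columns that gives column_stride = round_down_safe(156, 8) = 152, so
// bytes_per_row below is align_offset(ceil(152 / 8), 8) = 24 and
// row_stride = min(num_rows, 24576 / 24) = min(num_rows, 1024).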
- auto const bytes_per_row = align_offset(util::div_rounding_up_unsafe(column_stride, 8), 8); - auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); - - std::vector validity_block_infos; - for (int col = 0; col < num_columns; col += column_stride) { - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int row = 0; - while (row < num_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(row_stride, rows_left_in_batch); - - validity_block_infos.emplace_back(detail::block_info{ - col, row, std::min(col + column_stride - 1, num_columns - 1), row + window_height - 1}); - row += window_height; - rows_left_in_batch -= window_height; - } - } - - return validity_block_infos; - } - - std::vector build_block_infos(std::vector const &column_sizes, - std::vector const &column_starts, - std::vector const &row_batches, - size_type const total_number_of_rows, - size_type const &shmem_limit_per_block) { - std::vector block_infos; - - // block infos are organized with the windows going "down" the columns - // this provides the most coalescing of memory access - int current_window_width = 0; - int current_window_start_col = 0; - - // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( - int const start_col, int const end_col, int const desired_window_height) { - int current_window_start_row = 0; - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; - while (i < total_number_of_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(desired_window_height, rows_left_in_batch); - - block_infos.emplace_back(detail::block_info{ - start_col, current_window_start_row, end_col, - std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), - current_window_row_batch}); - - i += window_height; - current_window_start_row += window_height; - rows_left_in_batch -= window_height; - } - }; - - // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write - // would be memory cache line sized access, but since other blocks will read/write the edges - // this may not turn out to be overly important. For now, we will attempt to build a square - // window as far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = - // shared_mem_size since we want them equal, so height and width are sqrt(shared_mem_size). The - // trick is that it's in bytes, not rows or columns. - size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); - int const window_height = std::clamp( - util::round_up_safe( - std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], - total_number_of_rows), - 32), - 1, row_batches[0].row_count); - - auto calc_admin_data_size = [](int num_cols) -> size_type { - // admin data is the column sizes and column start information. - // this is copied to shared memory as well and needs to be accounted for - // in the window calculation. 
- return num_cols * sizeof(size_type) + num_cols * sizeof(size_type); - }; - - int row_size = 0; - - // march each column and build the blocks of appropriate sizes - for (unsigned int col = 0; col < column_sizes.size(); ++col) { - auto const col_size = column_sizes[col]; - - // align size for this type - std::size_t alignment_needed = col_size; // They are the same for fixed width types - auto row_size_aligned = detail::align_offset(row_size, alignment_needed); - auto row_size_with_this_col = row_size_aligned + col_size; - auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - - if (row_size_with_end_pad * window_height + - calc_admin_data_size(col - current_window_start_col) > - shmem_limit_per_block) { - // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col == 0 ? col : col - 1, window_height); - row_size = - detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); - row_size += col_size; // alignment required for shared memory window boundary to match - // alignment of output row - current_window_start_col = col; - current_window_width = 0; - } else { - row_size = row_size_with_this_col; - current_window_width++; - } - } - - // build last set of blocks - if (current_window_width > 0) { - build_blocks(current_window_start_col, (int)column_sizes.size() - 1, window_height); - } - - return block_infos; - } - - #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - - } // namespace detail - - std::vector> convert_to_rows(cudf::table_view const &tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the - // data, but small enough that multiple columns fit in memory so the writes can coalese as well. - // Potential optimization for window sizes. - const size_type num_columns = tbl.num_columns(); - const size_type num_rows = tbl.num_rows(); - - int device_id; - CUDA_TRY(cudaGetDevice(&device_id)); - int total_shmem; - CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - - // TODO: why? - total_shmem -= 1024; - int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - - // break up the work into blocks, which are a starting and ending row/col #. - // this window size is calculated based on the shared memory size available - // we want a single block to fill up the entire shared memory space available - // for the transpose-like conversion. - - // There are two different processes going on here. The GPU conversion of the data - // and the writing of the data into the list of byte columns that are a maximum of - // 2 gigs each due to offset maximum size. The GPU conversion portion has to understand - // this limitation because the column must own the data inside and as a result it must be - // a distinct allocation for that column. Copying the data into these final buffers would - // be prohibitively expensive, so care is taken to ensure the GPU writes to the proper buffer. - // The windows are broken at the boundaries of specific rows based on the row sizes up - // to that point. These are row batches and they are decided first before building the - // windows so the windows can be properly cut around them. 
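// Annotation (not part of the original file): this is why block_info carries a
// buffer_num and why row_offset is reset to zero whenever a new row batch starts below.
// copy_to_rows writes each row to output_data[block.buffer_num] + row_offsets[row] +
// column_offset, so row offsets are relative to the batch's own buffer rather than to
// the table as a whole, and nothing has to be re-copied to honor the ~2 GB per-column limit.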
- - // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; - input_data.reserve(num_columns); - input_nm.reserve(num_columns); - for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); - auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (!nested_type) { - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - } - - auto dev_input_data = make_device_uvector_async(input_data, stream, mr); - auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); - - std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row - std::vector column_sizes; // byte size of each column - std::vector column_starts; // offset of column inside a row including alignment - std::vector - variable_width_columns; // list of the variable width columns in the table - row_sizes.reserve(num_rows); - row_offsets.reserve(num_rows); - column_sizes.reserve(num_columns); - column_starts.reserve(num_columns + 1); // we add a final offset for validity data start - - auto iter = - thrust::make_transform_iterator(thrust::make_counting_iterator(0), - [&tbl](auto i) -> std::tuple { - return std::make_tuple(tbl.column(i).type(), tbl.column(i)); - }); - - size_type fixed_width_size_per_row = - detail::compute_column_information(iter, iter + num_columns, column_starts, column_sizes); - - auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); - auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); - - std::vector row_batches; - - uint64_t row_batch_size = 0; - uint64_t total_table_size = 0; - size_type row_batch_rows = 0; - uint64_t row_offset = 0; - - // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then - // calculate the size of each row's variable-width data and validity as well. 
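// Worked instance (illustrative, matching the layout example above): for
// {INT8, INT32, INT64}, fixed_width_size_per_row is 16; validity_size below is
// num_bitmask_words(3) * 4 = 4 bytes, since this path reserves whole 32-bit validity
// words per row, so each row_sizes[row] ends up as align_offset(16 + 4, 8) = 24 bytes.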
- auto validity_size = num_bitmask_words(num_columns) * 4; - // thrust - for (int row = 0; row < num_rows; ++row) { - auto aligned_row_batch_size = - detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned - row_sizes[row] = fixed_width_size_per_row; - // validity is byte aligned - row_sizes[row] += validity_size; - // variable width data is 8-byte aligned - row_sizes[row] = detail::align_offset(row_sizes[row], 8); // rows are 8 byte aligned - - if ((uint64_t)aligned_row_batch_size + row_sizes[row] > - (uint64_t)std::numeric_limits::max()) { - // a new batch starts at the last 32-row boundary - row_batches.push_back( - detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); - row_batch_size = 0; - row_batch_rows = row_batch_rows & 31; - row_offset = 0; - aligned_row_batch_size = 0; - } - row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned - row_offsets.push_back(row_offset); - row_batch_size = aligned_row_batch_size + row_sizes[row]; - row_offset += row_sizes[row]; - total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned - total_table_size += row_sizes[row]; - row_batch_rows++; - } - if (row_batch_size > 0) { - row_batches.push_back( - detail::row_batch{static_cast(row_batch_size), row_batch_rows}); - } - - auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); - - std::vector output_buffers; - std::vector output_data; - output_data.reserve(row_batches.size()); - for (uint i = 0; i < row_batches.size(); ++i) { - rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); - output_data.push_back(static_cast(temp.data())); - output_buffers.push_back(std::move(temp)); - } - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - - // blast through the entire table and convert it - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); - dim3 threads(256); - - detail::copy_to_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, block_infos.size(), dev_input_data.data(), - dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), dev_row_offsets.data(), - reinterpret_cast(dev_output_data.data())); - - auto validity_block_infos = - build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); - - auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); - dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); - dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); - detail::copy_validity_to_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, dev_row_offsets.data(), dev_output_data.data(), - column_starts.back(), dev_validity_block_infos.data(), validity_block_infos.size(), - dev_input_nm.data()); - - // split up the output buffer into multiple buffers based on row batch sizes - // and create list of byte columns - int offset_offset = 0; - std::vector> ret; - for (uint i = 0; i < row_batches.size(); ++i) { - // compute offsets for this row batch - std::vector offset_vals; - offset_vals.reserve(row_batches[i].row_count + 1); - size_type cur_offset = 0; - offset_vals.push_back(cur_offset); - for (int row = 0; row < row_batches[i].row_count; ++row) { - cur_offset = 
detail::align_offset(cur_offset, 8) + row_sizes[row + offset_offset]; - offset_vals.push_back(cur_offset); - } - offset_offset += row_batches[i].row_count; - - auto dev_offsets = make_device_uvector_async(offset_vals, stream, mr); - auto offsets = std::make_unique(data_type{type_id::INT32}, - (size_type)offset_vals.size(), dev_offsets.release()); - - auto data = std::make_unique(data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, - std::move(output_buffers[i])); - - ret.push_back( - cudf::make_lists_column(row_batches[i].row_count, std::move(offsets), std::move(data), 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr)); - } - - return ret; - #else - CUDF_FAIL("Column to row conversion optimization requires volta or later hardware."); - return {}; - #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - } - - std::vector> - convert_to_rows_fixed_width_optimized(cudf::table_view const &tbl, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { - const cudf::size_type num_columns = tbl.num_columns(); - - std::vector schema; - schema.resize(num_columns); - std::transform(tbl.begin(), tbl.end(), schema.begin(), detail::get_data_type); - - if (detail::are_all_fixed_width(schema)) { - std::vector column_start; - std::vector column_size; - - int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); - auto dev_column_start = make_device_uvector_async(column_start, stream, mr); - auto dev_column_size = make_device_uvector_async(column_size, stream, mr); - - int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; - // Make the number of rows per batch a multiple of 32 so we don't have to worry about - // splitting validity at a specific row offset. This might change in the future. - max_rows_per_batch = (max_rows_per_batch / 32) * 32; - - cudf::size_type num_rows = tbl.num_rows(); - - // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; - for (cudf::size_type column_number = 0; column_number < num_columns; column_number++) { - cudf::column_view cv = tbl.column(column_number); - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - auto dev_input_data = make_device_uvector_async(input_data, stream, mr); - auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); - - using ScalarType = cudf::scalar_type_t; - auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - zero->set_valid_async(true, stream); - static_cast(zero.get())->set_value(0, stream); - - auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - step->set_valid_async(true, stream); - static_cast(step.get()) - ->set_value(static_cast(size_per_row), stream); - - std::vector> ret; - for (cudf::size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { - cudf::size_type row_count = num_rows - row_start; - row_count = row_count > max_rows_per_batch ? 
max_rows_per_batch : row_count; - ret.emplace_back(detail::fixed_width_convert_to_rows( - row_start, row_count, num_columns, size_per_row, dev_column_start, dev_column_size, - dev_input_data, dev_input_nm, *zero, *step, stream, mr)); - } - - return ret; - } else { - CUDF_FAIL("Only fixed width types are currently supported"); - } - } - - std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - // verify that the types are what we expect - cudf::column_view child = input.child(); - cudf::type_id list_type = child.type().id(); - CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, - "Only a list of bytes is supported as input"); - - cudf::size_type num_columns = schema.size(); - cudf::size_type num_rows = input.parent().size(); - - int device_id; - CUDA_TRY(cudaGetDevice(&device_id)); - int total_shmem; - CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - - // TODO why? - total_shmem -= 1024; - int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - - std::vector column_starts; - std::vector column_sizes; - - auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { - return std::make_tuple(schema[i], nullptr); - }); - size_type fixed_width_size_per_row = detail::compute_column_information( - iter, iter + num_columns, column_starts, column_sizes); //, [](void *) {}); - - size_type validity_size = num_bitmask_words(num_columns) * 4; - - size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); - - // Ideally we would check that the offsets are all the same, etc. 
but for now - // this is probably fine - CUDF_EXPECTS(row_size * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); - auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); - - // build the row_batches from the passed in list column - std::vector row_batches; - - row_batches.push_back(detail::row_batch{child.size(), num_rows}); - - // Allocate the columns we are going to write into - std::vector> output_columns; - std::vector output_data; - std::vector output_nm; - for (cudf::size_type i = 0; i < num_columns; i++) { - auto column = cudf::make_fixed_width_column(schema[i], num_rows, - cudf::mask_state::UNINITIALIZED, stream, mr); - auto mut = column->mutable_view(); - output_data.emplace_back(mut.data()); - output_nm.emplace_back(mut.null_mask()); - output_columns.emplace_back(std::move(column)); - } - - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); - - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); - dim3 threads(std::min(std::min(256, shmem_limit_per_block / 8), (int)child.size())); - detail::copy_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), - dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), - block_infos.size(), child.data()); - - auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); - auto const column_stride = [&]() { - if (desired_rows_and_columns > num_columns) { - // not many columns, group it into 64s and ship it off - return std::min(64, num_columns); - } else { - return util::round_down_safe(desired_rows_and_columns, 8); - } - }(); - auto const row_stride = [&]() { - // we fit as much as we can, we know the column stride now, so calculate the row - return std::min(num_rows, util::round_down_safe(shmem_limit_per_block * 8 / column_stride, 32)); - /* if (desired_rows_and_columns > num_rows) { - return std::min(32, num_rows); - } else { - return util::round_down_safe(desired_rows_and_columns, 32); - }*/ - }(); - std::vector validity_block_infos; - for (int col = 0; col < num_columns; col += column_stride) { - for (int row = 0; row < num_rows; row += row_stride) { - validity_block_infos.emplace_back( - detail::block_info{col, row, std::min(col + column_stride - 1, num_columns - 1), - std::min(row + row_stride - 1, num_rows - 1)}); - } - } - auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); - dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); - - dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); - detail:: - copy_validity_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), - dev_output_nm.data(), column_starts.back(), dev_validity_block_infos.data(), - validity_block_infos.size(), child.data()); - - return std::make_unique(std::move(output_columns)); - #else - CUDF_FAIL("Row to column conversion optimization requires volta or later hardware."); - return {}; - #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - } - - std::unique_ptr 
convert_from_rows_fixed_width_optimized( - cudf::lists_column_view const &input, std::vector const &schema, - rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - // verify that the types are what we expect - cudf::column_view child = input.child(); - cudf::type_id list_type = child.type().id(); - CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, - "Only a list of bytes is supported as input"); - - cudf::size_type num_columns = schema.size(); - - if (detail::are_all_fixed_width(schema)) { - std::vector column_start; - std::vector column_size; - - cudf::size_type num_rows = input.parent().size(); - int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); - - // Ideally we would check that the offsets are all the same, etc. but for now - // this is probably fine - CUDF_EXPECTS(size_per_row * num_rows == child.size(), - "The layout of the data appears to be off"); - auto dev_column_start = make_device_uvector_async(column_start, stream); - auto dev_column_size = make_device_uvector_async(column_size, stream); - - // Allocate the columns we are going to write into - std::vector> output_columns; - std::vector output_data; - std::vector output_nm; - for (cudf::size_type i = 0; i < num_columns; i++) { - auto column = cudf::make_fixed_width_column(schema[i], num_rows, - cudf::mask_state::UNINITIALIZED, stream, mr); - auto mut = column->mutable_view(); - output_data.emplace_back(mut.data()); - output_nm.emplace_back(mut.null_mask()); - output_columns.emplace_back(std::move(column)); - } - - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); - - dim3 blocks; - dim3 threads; - int shared_size = - detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - - detail::copy_from_rows_fixed_width_optimized<<>>( - num_rows, num_columns, size_per_row, dev_column_start.data(), dev_column_size.data(), - dev_output_data.data(), dev_output_nm.data(), child.data()); - - return std::make_unique(std::move(output_columns)); - } else { - CUDF_FAIL("Only fixed width types are currently supported"); - } - } - - } // namespace cudf - \ No newline at end of file diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp deleted file mode 100644 index b807b5cec81..00000000000 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ /dev/null @@ -1,677 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -struct ColumnToRowTests : public cudf::test::BaseFixture { -}; -struct RowToColumnTests : public cudf::test::BaseFixture { -}; - -TEST_F(ColumnToRowTests, Single) -{ - cudf::test::fixed_width_column_wrapper a({-1}); - cudf::table_view in(std::vector{a}); - std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Simple) -{ - cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); - cudf::table_view in(std::vector{a}); - std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Tall) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); - cudf::table_view in(std::vector{a}); - std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Wide) -{ - std::vector> cols; - std::vector views; - std::vector schema; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, SingleByteWide) -{ - std::vector> cols; - std::vector views; - std::vector schema; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); - views.push_back(cols.back()); - - schema.push_back(cudf::data_type{cudf::type_id::INT8}); - } - cudf::table_view 
in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Non2Power) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - constexpr auto num_rows = 6 * 1024 + 557; - for (int i = 0; i < 131; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - for (int j = 0; j < old_tbl->num_columns(); ++j) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); - } - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Big) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - // 28 columns of 1 million rows - constexpr auto num_rows = 1024 * 1024; - for (int i = 0; i < 28; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - for (int j = 0; j < old_tbl->num_columns(); ++j) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); - } - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Bigger) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - // 128 columns of 1 million rows - constexpr auto num_rows = 1024 * 1024; - for (int i = 0; i < 128; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i 
< old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - for (int j = 0; j < old_tbl->num_columns(); ++j) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); - } - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Biggest) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - // 128 columns of 2 million rows - constexpr auto num_rows = 2 * 1024 * 1024; - for (int i = 0; i < 128; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - for (int j = 0; j < old_tbl->num_columns(); ++j) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); - } - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Single) -{ - cudf::test::fixed_width_column_wrapper a({-1}); - cudf::table_view in(std::vector{a}); - - auto old_rows = cudf::convert_to_rows(in); - std::vector schema{cudf::data_type{cudf::type_id::INT32}}; - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Simple) -{ - cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); - cudf::table_view in(std::vector{a}); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - std::vector schema{cudf::data_type{cudf::type_id::INT32}}; - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Tall) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); - cudf::table_view in(std::vector{a}); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - std::vector schema; - schema.reserve(in.num_columns()); - for (auto col = in.begin(); col < in.end(); ++col) { - schema.push_back(col->type()); - } - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - 
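[Editor's note] The tests being removed in this diff are the main usage reference for the conversion API, so a compact round-trip sketch of the same pattern may help. It is illustrative only: it assumes the cudf::convert_to_rows / cudf::convert_from_rows entry points declared in the original cpp/include/cudf/row_conversion.hpp (the Java bindings later switch to cudf::java::), and round_trip_example is a hypothetical name.

#include <cudf/row_conversion.hpp>
#include <cudf/table/table.hpp>
#include <cudf_test/column_wrapper.hpp>

// Hypothetical helper showing the to-rows/from-rows round trip the tests above exercise.
void round_trip_example()
{
  cudf::test::fixed_width_column_wrapper<int32_t> a({-1, 0, 1});
  cudf::table_view in(std::vector<cudf::column_view>{a});

  // The schema handed to convert_from_rows is simply the column types of the input table.
  std::vector<cudf::data_type> schema;
  for (auto col = in.begin(); col != in.end(); ++col) {
    schema.push_back(col->type());
  }

  // Each returned list column holds one batch of JCUDF-encoded rows (a list of bytes per row).
  auto row_batches = cudf::convert_to_rows(in);
  for (auto const& batch : row_batches) {
    auto tbl = cudf::convert_from_rows(cudf::lists_column_view(*batch), schema);
    // tbl is expected to be equivalent to the corresponding slice of `in`.
  }
}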
-TEST_F(RowToColumnTests, Wide) -{ - std::vector> cols; - std::vector views; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({i})); // rand()})); - views.push_back(cols.back()); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - std::vector schema; - schema.reserve(in.num_columns()); - for (auto col = in.begin(); col < in.end(); ++col) { - schema.push_back(col->type()); - } - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, SingleByteWide) -{ - std::vector> cols; - std::vector views; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); - views.push_back(cols.back()); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - std::vector schema; - schema.reserve(in.num_columns()); - for (auto col = in.begin(); col < in.end(); ++col) { - schema.push_back(col->type()); - } - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, AllTypes) -{ - std::vector> cols; - std::vector views; - std::vector schema{cudf::data_type{cudf::type_id::INT64}, - cudf::data_type{cudf::type_id::FLOAT64}, - cudf::data_type{cudf::type_id::INT8}, - cudf::data_type{cudf::type_id::BOOL8}, - cudf::data_type{cudf::type_id::FLOAT32}, - cudf::data_type{cudf::type_id::INT8}, - cudf::data_type{cudf::type_id::INT32}, - cudf::data_type{cudf::type_id::INT64}}; - - cudf::test::fixed_width_column_wrapper c0({3, 9, 4, 2, 20, 0}, {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper c1({5.0, 9.5, 0.9, 7.23, 2.8, 0.0}, - {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper c2({5, 1, 0, 2, 7, 0}, {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper c3({true, false, false, true, false, false}, - {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper c4({1.0f, 3.5f, 5.9f, 7.1f, 9.8f, 0.0f}, - {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper c5({2, 3, 4, 5, 9, 0}, {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_point_column_wrapper c6( - {-300, 500, 950, 90, 723, 0}, {1, 1, 1, 1, 1, 1, 1, 0}, numeric::scale_type{-2}); - cudf::test::fixed_point_column_wrapper c7( - {-80, 30, 90, 20, 200, 0}, {1, 1, 1, 1, 1, 1, 0}, numeric::scale_type{-1}); - - cudf::table_view in({c0, c1, c2, c3, c4, c5, c6, c7}); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, AllTypesLarge) -{ - std::vector cols; - std::vector schema{}; - - // 10 columns of each type with 1024 entries - constexpr int num_rows{1024}; - - std::default_random_engine re; - 
std::uniform_real_distribution rand_double(std::numeric_limits::min(), - std::numeric_limits::max()); - std::uniform_int_distribution rand_int64(std::numeric_limits::min(), - std::numeric_limits::max()); - auto r = cudf::detail::make_counting_transform_iterator( - 0, [&](auto i) -> int64_t { return rand_int64(re); }); - auto d = cudf::detail::make_counting_transform_iterator( - 0, [&](auto i) -> double { return rand_double(re); }); - - auto all_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 1; }); - auto none_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 0; }); - auto most_valid = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return rand() % 2 == 0 ? 0 : 1; }); - auto few_valid = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return rand() % 13 == 0 ? 1 : 0; }); - - for (int i = 0; i < 10; ++i) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, all_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::INT8}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::INT16}); - } - - for (int i = 0; i < 10; ++i) { - if (i < 5) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) - .release() - .release()); - } else { - cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, none_valid) - .release() - .release()); - } - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(d, d + num_rows, most_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::FLOAT32}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(d, d + num_rows, most_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::FLOAT64}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::BOOL8}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper( - r, r + num_rows, all_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper( - r, r + num_rows, most_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back(*cudf::test::fixed_point_column_wrapper( - r, r + num_rows, all_valid, numeric::scale_type{-2}) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::DECIMAL32}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back(*cudf::test::fixed_point_column_wrapper( - r, r + num_rows, most_valid, numeric::scale_type{-1}) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::DECIMAL64}); - } - - std::vector views(cols.begin(), cols.end()); - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - 
cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Non2Power) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - constexpr auto num_rows = 6 * 1024 + 557; - for (int i = 0; i < 131; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Big) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - // 28 columns of 1 million rows - constexpr auto num_rows = 1024 * 1024; - for (int i = 0; i < 28; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Bigger) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - // 28 columns of 1 million rows - constexpr auto num_rows = 1024 * 1024; - for (int i = 0; i < 128; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Biggest) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - // 28 columns of 1 million rows - constexpr auto num_rows = 5 * 1024 * 1024; - for (int i = 0; i < 128; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - 
schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 932afa4bb70..f5936e86bcd 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -50,7 +50,7 @@ #include #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 -constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 2; constexpr auto NUM_BLOCKS_PER_KERNEL_TO_ROWS = 2; constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; @@ -409,7 +409,7 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum auto &fetch_barrier = block_barrier[fetch % NUM_BLOCKS_PER_KERNEL_LOADED]; // wait for the last use of the memory to be completed - if (fetch > NUM_BLOCKS_PER_KERNEL_LOADED) { + if (fetch >= NUM_BLOCKS_PER_KERNEL_LOADED) { fetch_barrier.arrive_and_wait(); } @@ -525,7 +525,7 @@ __global__ void copy_validity_to_rows( group.sync(); for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { - if (validity_block != validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { + if (validity_block >= NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] .arrive_and_wait(); } @@ -645,10 +645,10 @@ get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num * */ __global__ void copy_from_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_block, const size_type *row_offsets, - int8_t **output_data, const size_type *_col_sizes, - const size_type *_col_offsets, const block_info *block_infos, - const size_type num_block_infos, const int8_t *input_data) { + const size_type shmem_used_per_block, const size_type *row_offsets, + int8_t **output_data, const size_type *_col_sizes, + const size_type *_col_offsets, const block_info *block_infos, + const size_type num_block_infos, const int8_t *input_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -819,8 +819,8 @@ __global__ void copy_validity_from_rows( group.sync(); for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { - auto const validity_index = validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - if (validity_block != validity_index) { + if (validity_block >= NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { + auto const validity_index = validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; shared_block_barriers[validity_index].arrive_and_wait(); } int8_t *this_shared_block = shared_blocks[validity_block % 2]; @@ -1251,7 +1251,7 @@ std::vector> convert_to_rows(cudf::table_view cons // TODO: why? 
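  // (Editorial assumption, not stated in the patch:) the 1 KiB subtracted below most
  // likely leaves headroom for the kernels' static __shared__ declarations, e.g. the
  // cuda::barrier arrays, which count against the same per-block budget reported by
  // cudaDevAttrMaxSharedMemoryPerBlock as the dynamically sized shared_data allocation.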
total_shmem -= 1024; - int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + int shmem_limit_per_block = total_shmem / NUM_BLOCKS_PER_KERNEL_LOADED; // break up the work into blocks, which are a starting and ending row/col #. // this window size is calculated based on the shared memory size available @@ -1368,7 +1368,7 @@ std::vector> convert_to_rows(cudf::table_view cons std::vector block_infos = build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); + auto dev_block_infos = make_device_uvector_async(block_infos, stream); // blast through the entire table and convert it dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); @@ -1382,12 +1382,11 @@ std::vector> convert_to_rows(cudf::table_view cons auto validity_block_infos = build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); - auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream); dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_VALIDITY_BLOCKS_PER_KERNEL)); dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); - detail::copy_validity_to_rows<<>>( + detail::copy_validity_to_rows<<>>( num_rows, num_columns, shmem_limit_per_block, dev_row_offsets.data(), dev_output_data.data(), column_starts.back(), dev_validity_block_infos.data(), validity_block_infos.size(), dev_input_nm.data()); @@ -1508,7 +1507,7 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in // TODO why? total_shmem -= 1024; - int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + int shmem_limit_per_block = total_shmem / NUM_BLOCKS_PER_KERNEL_LOADED; std::vector column_starts; std::vector column_sizes; @@ -1590,7 +1589,7 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in } auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_VALIDITY_BLOCKS_PER_KERNEL)); dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); detail:: From e9938b96890e2bca0591a4ab857f8c36c2bf4c49 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Sat, 23 Oct 2021 01:37:52 +0000 Subject: [PATCH 63/80] removing unused header, suppressing shared warning for barrier, updating java bindings to use the correct namespace --- cpp/include/cudf/row_conversion.hpp | 51 ---------------------- java/src/main/native/src/TableJni.cpp | 9 ++-- java/src/main/native/src/row_conversion.cu | 6 ++- 3 files changed, 9 insertions(+), 57 deletions(-) delete mode 100644 cpp/include/cudf/row_conversion.hpp diff --git a/cpp/include/cudf/row_conversion.hpp b/cpp/include/cudf/row_conversion.hpp deleted file mode 100644 index 5d799f4c596..00000000000 --- a/cpp/include/cudf/row_conversion.hpp +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include -#include -#include - -namespace cudf { - -std::vector> convert_to_rows_fixed_width_optimized( - cudf::table_view const& tbl, - // TODO need something for validity - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -std::vector> convert_to_rows( - cudf::table_view const& tbl, - // TODO need something for validity - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr convert_from_rows_fixed_width_optimized( - cudf::lists_column_view const& input, - std::vector const& schema, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr convert_from_rows( - cudf::lists_column_view const& input, - std::vector const& schema, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -} // namespace cudf diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 45403f1eb0d..d7209a23ede 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -2697,7 +2696,7 @@ Java_ai_rapids_cudf_Table_convertToRowsFixedWidthOptimized(JNIEnv *env, jclass, cudf::jni::auto_set_device(env); cudf::table_view *n_input_table = reinterpret_cast(input_table); std::vector> cols = - cudf::convert_to_rows_fixed_width_optimized(*n_input_table); + cudf::java::convert_to_rows_fixed_width_optimized(*n_input_table); int num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); for (int i = 0; i < num_columns; i++) { @@ -2715,7 +2714,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env try { cudf::jni::auto_set_device(env); cudf::table_view *n_input_table = reinterpret_cast(input_table); - std::vector> cols = cudf::convert_to_rows(*n_input_table); + std::vector> cols = cudf::java::convert_to_rows(*n_input_table); int num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); for (int i = 0; i < num_columns; i++) { @@ -2742,7 +2741,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRowsFixedWidth types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i])); } std::unique_ptr result = - cudf::convert_from_rows_fixed_width_optimized(list_input, types_vec); + cudf::java::convert_from_rows_fixed_width_optimized(list_input, types_vec); return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); @@ -2765,7 +2764,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *e for (int i = 0; i < n_types.size(); i++) { types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i])); } - std::unique_ptr result = cudf::convert_from_rows(list_input, types_vec); + 
std::unique_ptr result = cudf::java::convert_from_rows(list_input, types_vec); return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index f5936e86bcd..af26e4c0b0d 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -34,7 +34,6 @@ #include #include #include -#include #include #include #include @@ -49,12 +48,17 @@ #include #include +#include "row_conversion.hpp" + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 2; constexpr auto NUM_BLOCKS_PER_KERNEL_TO_ROWS = 2; constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; + +// needed to suppress warning about cuda::barrier +#pragma diag_suppress static_var_with_dynamic_init #endif using cudf::detail::make_device_uvector_async; From 3c6b1e5ebff9f8265f5fbb47be457e0a68fc98a8 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Sat, 30 Oct 2021 01:00:38 +0000 Subject: [PATCH 64/80] updating code to build block infos with thrust on the gpu --- java/src/main/native/src/row_conversion.cu | 670 +++++++++++++-------- 1 file changed, 418 insertions(+), 252 deletions(-) diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index af26e4c0b0d..87ab1ed49d8 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -21,6 +21,8 @@ #include #include +#include +#include #include #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 @@ -34,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -47,8 +50,7 @@ #include #include #include - -#include "row_conversion.hpp" +#include #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 2; @@ -64,7 +66,7 @@ constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; using cudf::detail::make_device_uvector_async; using rmm::device_uvector; namespace cudf { - +namespace java { namespace detail { static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size_t alignment) { @@ -324,6 +326,11 @@ __global__ void copy_to_rows_fixed_width_optimized( #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +/** + * @brief The GPU blocks work on one or more block_info structs of data. + * This structure defined the workspace for the block. + * + */ struct block_info { int start_col; int start_row; @@ -340,38 +347,36 @@ struct block_info { __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } }; -// When building the columns to return, we have to be mindful of the offset limit in cudf. -// It is 32-bit and these data columns are capable of surpassing that easily. The data should -// not be cut off exactly at the limit though due to the validity buffers. The most efficient -// place to cut the validity is on a 32-row boundary, so as we calculate the row sizes -// we keep track of the cut points for the validity, which we call row batches. If the row -// is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we -// hit. Note that this boundary is for our book-keeping with column pointers and not anything that -// the kernel needs to worry about. We cut the output at convienient boundaries when assembling -// the outgoing data stream. 
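[Editor's note] The comment block removed above is the rationale for row batches: offsets into the output byte column are 32-bit, so rows are grouped into batches whose total size stays under the size_type limit, preferably cut on a 32-row boundary so validity words never straddle a batch. A simplified host-side sketch of that rule follows (illustrative only; the new build_batches further below computes batch boundaries on the device with an inclusive scan of row sizes and a lower_bound against max_batch_size):

#include <cstdint>
#include <limits>
#include <vector>

// Returns the exclusive end row of the batch that starts at `batch_start`, keeping the
// batch's byte count within a 32-bit offset and cutting on a 32-row boundary when possible.
// Host-side illustration only; the real implementation scans row sizes on the GPU.
int32_t find_batch_end(std::vector<int64_t> const& row_sizes, int32_t batch_start)
{
  constexpr int64_t max_batch_size = std::numeric_limits<int32_t>::max();
  int64_t bytes = 0;
  int32_t row   = batch_start;
  while (row < static_cast<int32_t>(row_sizes.size()) &&
         bytes + row_sizes[row] <= max_batch_size) {
    bytes += row_sizes[row++];
  }
  if (row < static_cast<int32_t>(row_sizes.size())) {
    // ran out of room: back up to the last 32-row boundary so validity stays word-aligned
    auto const rounded = batch_start + ((row - batch_start) / 32) * 32;
    if (rounded > batch_start) { row = rounded; }
  }
  return row;  // rows [batch_start, row) form one batch
}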
+/** + * @brief Returning rows is done in a byte cudf column. This is limited in size by + * `size_type` and so output is broken into batches of rows that fit inside + * this limit. + * + */ struct row_batch { size_type num_bytes; size_type row_count; + device_uvector row_offsets; }; /** - * @brief copy data from cudf columns into x format, which is row-based + * @brief copy data from cudf columns into JCUDF format, which is row-based * * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table + * @param shmem_used_per_block shared memory amount each `block_info` is using + * @param block_infos span of `block_info` structs the define the work * @param input_data pointer to raw table data - * @param input_nm pointer to validity data * @param col_sizes array of sizes for each element in a column - one per column * @param col_offsets offset into input data row for each column's start - * @param block_infos information about the blocks of work * @param row_offsets offset to a specific row in the input data * @param output_data pointer to output data * */ __global__ void copy_to_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_block, const size_type num_block_infos, - const int8_t **input_data, const size_type *col_sizes, - const size_type *col_offsets, const block_info *block_infos, + const size_type shmem_used_per_block, + device_span block_infos, const int8_t **input_data, + const size_type *col_sizes, const size_type *col_offsets, const size_type *row_offsets, int8_t **output_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. @@ -396,7 +401,7 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum group.sync(); auto const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS, + std::min((uint)block_infos.size() - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS, (uint)NUM_BLOCKS_PER_KERNEL_TO_ROWS); size_t fetch; @@ -491,23 +496,25 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets + * @param row_offsets offset to a specific row in the input data * @param output_data pointer to output data, partitioned by data size * @param validity_offsets offset into input data row for validity data * @param block_infos information about the blocks of work - * @param num_block_infos number of infos in blocks array * @param input_data pointer to input data * */ -__global__ void copy_validity_to_rows( - const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, - const size_type *row_offsets, int8_t **output_data, const size_type validity_offset, - const block_info *block_infos, const size_type num_block_infos, const bitmask_type **input_nm) { +__global__ void copy_validity_to_rows(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, + const size_type *row_offsets, int8_t **output_data, + const size_type validity_offset, + device_span block_infos, + const bitmask_type **input_nm) { extern __shared__ int8_t shared_data[]; int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { shared_data, shared_data + shmem_used_per_block / 2}; - // per conversation with DaveB + using 
cudf::detail::warp_size; + // each thread of warp reads a single int32 of validity - so we read 128 bytes // then ballot_sync the bits and write the result to shmem // after we fill shared mem memcpy it out in a blob. @@ -515,7 +522,7 @@ __global__ void copy_validity_to_rows( auto group = cooperative_groups::this_thread_block(); int const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, + std::min((uint)block_infos.size() - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); __shared__ cuda::barrier @@ -545,9 +552,9 @@ __global__ void copy_validity_to_rows( align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); auto const total_sections = num_sections_x * num_sections_y; - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + int const warp_id = threadIdx.x / warp_size; + int const lane_id = threadIdx.x % warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / warp_size); // the block is divided into sections. A warp operates on a section at a time. for (int my_section_idx = warp_id; my_section_idx < total_sections; @@ -567,7 +574,7 @@ __global__ void copy_validity_to_rows( input_nm[absolute_col][absolute_row / 32] : std::numeric_limits::max(); - // every thread that is participating in the warp has a byte, but it's column-based + // every thread that is participating in the warp has 4 bytes, but it's column-based // data and we need it in row-based. So we shuffle the bits around with ballot_sync to // make the bytes we actually write. bitmask_type dw_mask = 1; @@ -576,7 +583,7 @@ __global__ void copy_validity_to_rows( // lead thread in each warp writes data auto const validity_write_offset = validity_data_row_length * (relative_row + i) + relative_col / 8; - if (threadIdx.x % detail::warp_size == 0) { + if (threadIdx.x % warp_size == 0) { if (cols_left <= 8) { // write byte this_shared_block[validity_write_offset] = validity_data & 0xFF; @@ -625,6 +632,14 @@ __global__ void copy_validity_to_rows( } } +/** + * @brief Admin data is data stored in shared memory that isn't actual column data + * + * @param col_size_size size of the column size data. + * @param col_offset_size size of the column offset data. + * @param num_cols number of columns in the block. + * @return tuple of the size of column and offset admin data. 
+ */ static __device__ std::tuple get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num_cols) { auto const col_size_bytes = num_cols * col_size_size; @@ -639,9 +654,8 @@ get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table * @param shmem_used_per_block amount of shared memory that is used by a block - * @param row_offsets - * @param output_data - * @param output_nm + * @param row_offsets offset to a specific row in the input data + * @param output_data pointers to column data * @param col_sizes array of sizes for each element in a column - one per column * @param col_offsets offset into input data row for each column's start * @param block_infos information about the blocks of work @@ -651,8 +665,9 @@ get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num __global__ void copy_from_rows(const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, const size_type *row_offsets, int8_t **output_data, const size_type *_col_sizes, - const size_type *_col_offsets, const block_info *block_infos, - const size_type num_block_infos, const int8_t *input_data) { + const size_type *_col_offsets, + device_span block_infos, + const int8_t *input_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -678,8 +693,9 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col group.sync(); - auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS, - (uint)NUM_BLOCKS_PER_KERNEL_FROM_ROWS); + auto blocks_remaining = + std::min((uint)block_infos.size() - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS, + (uint)NUM_BLOCKS_PER_KERNEL_FROM_ROWS); size_t fetch_index; size_t processing_index; @@ -785,23 +801,24 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets - * @param output_nm + * @param row_offsets offset to a specific row in the input data + * @param output_nm pointers to null masks for columns * @param validity_offsets offset into input data row for validity data * @param block_infos information about the blocks of work - * @param num_block_infos number of infos in blocks array * @param input_data pointer to input data * */ -__global__ void copy_validity_from_rows( - const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, - const size_type *row_offsets, cudf::bitmask_type **output_nm, const size_type validity_offset, - const block_info *block_infos, const size_type num_block_infos, const int8_t *input_data) { +__global__ void +copy_validity_from_rows(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, const size_type *row_offsets, + cudf::bitmask_type **output_nm, const size_type validity_offset, + device_span block_infos, const int8_t *input_data) { extern __shared__ int8_t shared_data[]; int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { shared_data, shared_data + shmem_used_per_block / 2}; - // per conversation with DaveB + using cudf::detail::warp_size; 
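  // The loop below transposes row-major validity into column-major bitmask words:
  // each lane of the warp loads the validity byte of one of 32 consecutive rows (its
  // relative_row); bit i of that byte is the validity of column (relative_col + i) in
  // that row.  For each i, __ballot_sync(participation_mask, my_byte & byte_mask)
  // collects that bit from all 32 lanes into a single 32-bit word, i.e. the validity
  // of 32 consecutive rows of column (relative_col + i), which is staged in shared
  // memory and later copied out into that column's null mask.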
+ // each thread of warp reads a single byte of validity - so we read 32 bytes // then ballot_sync the bits and write the result to shmem // after we fill shared mem memcpy it out in a blob. @@ -809,7 +826,7 @@ __global__ void copy_validity_from_rows( auto group = cooperative_groups::this_thread_block(); int const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, + std::min((uint)block_infos.size() - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); __shared__ cuda::barrier @@ -837,14 +854,14 @@ __global__ void copy_validity_from_rows( auto const num_sections_y = (num_block_rows + 31) / 32; auto const validity_data_col_length = num_sections_y * 4; // words to bytes auto const total_sections = num_sections_x * num_sections_y; - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + int const warp_id = threadIdx.x / warp_size; + int const lane_id = threadIdx.x % warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / warp_size); // the block is divided into sections. A warp operates on a section at a time. for (int my_section_idx = warp_id; my_section_idx < total_sections; my_section_idx += warps_per_block) { - // convert to rows and cols + // convert section to row and col auto const section_x = my_section_idx % num_sections_x; auto const section_y = my_section_idx / num_sections_x; auto const relative_col = section_x * 8; @@ -860,13 +877,13 @@ __global__ void copy_validity_from_rows( input_data[row_offsets[absolute_row] + validity_offset + absolute_col / 8]; // so every thread that is participating in the warp has a byte, but it's row-based - // data and we need it in column-based. So we shiffle the bits around to make + // data and we need it in column-based. So we shuffle the bits around to make // the bytes we actually write. for (int i = 0, byte_mask = 1; i < 8 && relative_col + i < num_columns; ++i, byte_mask <<= 1) { auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); // lead thread in each warp writes data - if (threadIdx.x % detail::warp_size == 0) { + if (threadIdx.x % warp_size == 0) { auto const validity_write_offset = validity_data_col_length * (relative_col + i) + relative_row / 8; @@ -898,10 +915,10 @@ __global__ void copy_validity_from_rows( // now async memcpy the shared for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { auto const relative_col = col - block.start_col; + auto const starting_address = output_nm[col] + word_index(block_start_row); cuda::memcpy_async( - output_nm[col] + word_index(block_start_row), - &this_shared_block[validity_data_col_length * relative_col], + starting_address, &this_shared_block[validity_data_col_length * relative_col], util::div_rounding_up_unsafe(num_block_rows, 8), shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); } @@ -919,7 +936,8 @@ __global__ void copy_validity_from_rows( #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 /** - * Calculate the dimensions of the kernel for fixed width only columns. + * @brief Calculate the dimensions of the kernel for fixed width only columns. + * * @param [in] num_columns the number of columns being copied. * @param [in] num_rows the number of rows being copied. * @param [in] size_per_row the size each row takes up when padded. 
@@ -995,7 +1013,7 @@ fixed_width_convert_to_rows(const cudf::size_type start_row, const cudf::size_ty rmm::device_uvector &input_nm, const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - int64_t total_allocation = size_per_row * num_rows; + int64_t const total_allocation = size_per_row * num_rows; // We made a mistake in the split somehow CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); @@ -1020,17 +1038,14 @@ fixed_width_convert_to_rows(const cudf::size_type start_row, const cudf::size_ty rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr); } -static cudf::data_type get_data_type(const cudf::column_view &v) { - return v.type(); -} - static inline bool are_all_fixed_width(std::vector const &schema) { return std::all_of(schema.begin(), schema.end(), [](const cudf::data_type &t) { return cudf::is_fixed_width(t); }); } /** - * Given a set of fixed width columns, calculate how the data will be laid out in memory. + * @brief Given a set of fixed width columns, calculate how the data will be laid out in memory. + * * @param [in] schema the types of columns that need to be laid out. * @param [out] column_start the byte offset where each column starts in the row. * @param [out] column_size the size in bytes of the data for each columns in the row. @@ -1065,19 +1080,25 @@ static inline int32_t compute_fixed_width_layout(std::vector co #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +/** + * @brief Compute information about a table such as bytes per row and offsets. + * + * @tparam iterator iterator of column schema data + * @param begin starting iterator of column schema + * @param end ending iterator of column schema + * @param column_starts column start offsets + * @param column_sizes size in bytes of each column + * @return size of the fixed_width data portion of a row. + */ template static size_type compute_column_information(iterator begin, iterator end, std::vector &column_starts, - std::vector &column_sizes) //, -// std::function nested_type_cb) -{ + std::vector &column_sizes) { size_type fixed_width_size_per_row = 0; for (auto cv = begin; cv != end; ++cv) { auto col_type = std::get<0>(*cv); bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - // if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } - // a list or string column will write a single uint64 // of data here for offset/length auto col_size = nested_type ? 8 : size_of(col_type); @@ -1096,6 +1117,15 @@ static size_type compute_column_information(iterator begin, iterator end, return fixed_width_size_per_row; } +/** + * @brief Build `block_info` for the validity data to break up the work. 
+ * + * @param num_columns number of columns in the table + * @param num_rows number of rows in the table + * @param shmem_limit_per_block size of shared memory available to a single gpu block + * @param row_batches batched row information for multiple output locations + * @return vector of `block_info` structs for validity data + */ std::vector build_validity_block_infos(size_type const &num_columns, size_type const &num_rows, size_type const &shmem_limit_per_block, @@ -1139,43 +1169,202 @@ build_validity_block_infos(size_type const &num_columns, size_type const &num_ro return validity_block_infos; } -std::vector build_block_infos(std::vector const &column_sizes, - std::vector const &column_starts, - std::vector const &row_batches, - size_type const total_number_of_rows, - size_type const &shmem_limit_per_block) { - std::vector block_infos; +constexpr size_type max_batch_size = std::numeric_limits::max(); + +/** + * @brief Holds information about the batches of data to be processed + * + */ +struct batch_data { + std::vector batch_row_boundaries; + device_uvector input_data_row_offsets; + std::vector row_batches; + + batch_data(size_type num_input_offsets, rmm::cuda_stream_view stream) + : input_data_row_offsets(num_input_offsets, stream){}; +}; +/** + * @brief Builds batches of rows that will fit in the size limit of a column. + * + * @tparam RowSize iterator that gives the size of a specific row of the table. + * @param num_rows Total number of rows in the table + * @param row_sizes iterator that gives the size of a specific row of the table. + * @param stream stream to operate on for this work + * @param mr memory resource used to allocate any returned data + * @returns vector of size_type's that indicate row numbers for batch boundaries and a + * device_uvector of row offsets + */ + +template +batch_data build_batches(size_type num_rows, RowSize row_sizes, rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { + auto const total_size = thrust::reduce(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows); + auto const num_batches = util::div_rounding_up_safe(total_size, max_batch_size); + auto const num_offsets = num_batches + 1; + batch_data ret(num_rows + 1, stream); + + // at most max gpu memory / 2GB iterations. + ret.batch_row_boundaries.reserve(num_offsets); + ret.batch_row_boundaries.push_back(0); + size_type last_row_end = 0; + device_uvector cumulative_row_sizes(num_rows, stream); + thrust::inclusive_scan(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows, + cumulative_row_sizes.begin()); + while ((int)ret.batch_row_boundaries.size() < num_offsets) { + // find the next max_batch_size boundary + size_type const row_end = + ((thrust::lower_bound(rmm::exec_policy(stream), cumulative_row_sizes.begin(), + cumulative_row_sizes.begin() + (num_rows - last_row_end), + max_batch_size) - + cumulative_row_sizes.begin()) + + last_row_end); + + // build offset list for each row in this batch + auto const num_entries = row_end - last_row_end + 1; + device_uvector output_batch_row_offsets(num_entries, stream, mr); + + auto row_size_iter_bounded = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [row_end, row_sizes, last_row_end] __device__(auto i) { + return i >= row_end ? 
0 : row_sizes[i + last_row_end]; + }); + + thrust::exclusive_scan(rmm::exec_policy(stream), row_size_iter_bounded, + row_size_iter_bounded + num_entries, output_batch_row_offsets.begin()); + + ret.batch_row_boundaries.push_back(row_end); + auto const batch_bytes = output_batch_row_offsets.element(row_end, stream) - + output_batch_row_offsets.element(last_row_end, stream); + auto const num_rows_in_batch = row_end - last_row_end; + ret.row_batches.push_back( + {batch_bytes, num_rows_in_batch, std::move(output_batch_row_offsets)}); + last_row_end = row_end; + } + + auto row_size_iter = cudf::detail::make_counting_transform_iterator( + 0, [row_sizes, num_rows] __device__(auto i) { return (i < num_rows) ? row_sizes[i] : 0; }); + thrust::exclusive_scan(rmm::exec_policy(stream), row_size_iter, row_size_iter + num_rows + 1, + ret.input_data_row_offsets.begin()); + + return ret; +} + +/** + * @brief Computes the number of blocks necessary given a window height and batch offsets + * + * @param batch_row_offsets row offsets for each batch + * @param desired_window_height height of each window in the table + * @param stream stream to use + * @return number of windows necessary + */ +int compute_block_counts(device_span const &batch_row_offsets, + int desired_window_height, rmm::cuda_stream_view stream) { + size_type const num_batches = batch_row_offsets.size() - 1; + device_uvector num_blocks(num_batches, stream); + auto iter = thrust::make_counting_iterator(0); + thrust::transform( + rmm::exec_policy(stream), iter, iter + num_batches, num_blocks.begin(), + [desired_window_height, + batch_row_offsets = batch_row_offsets.data()] __device__(auto batch_index) -> size_type { + return util::div_rounding_up_unsafe(batch_row_offsets[batch_index + 1] - + batch_row_offsets[batch_index], + desired_window_height); + }); + return thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); +} + +/** + * @brief Builds the `block_info` structs for a given table. + * + * @param blocks span of blocks to populate + * @param batch_row_offsets offsets to row batches + * @param column_start starting column of the window + * @param column_end ending column of the window + * @param desired_window_height height of the window + * @param total_number_of_rows total number of rows in the table + * @param stream stream to use + * @return number of windows created + */ +size_type +build_blocks(device_span blocks, + device_uvector const &batch_row_offsets, // comes from build_batches + int column_start, int column_end, int desired_window_height, int total_number_of_rows, + rmm::cuda_stream_view stream) { + size_type const num_batches = batch_row_offsets.size() - 1; + device_uvector num_blocks(num_batches, stream); + auto iter = thrust::make_counting_iterator(0); + thrust::transform( + rmm::exec_policy(stream), iter, iter + num_batches, num_blocks.begin(), + [desired_window_height, + batch_row_offsets = batch_row_offsets.data()] __device__(auto batch_index) -> size_type { + return util::div_rounding_up_unsafe(batch_row_offsets[batch_index + 1] - + batch_row_offsets[batch_index], + desired_window_height); + }); + + size_type const total_blocks = + thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); + + device_uvector block_starts(num_batches + 1, stream); + auto block_iter = cudf::detail::make_counting_transform_iterator( + 0, [num_blocks = num_blocks.data(), num_batches] __device__(auto i) { + return (i < num_batches) ? 
num_blocks[i] : 0; + }); + thrust::exclusive_scan(rmm::exec_policy(stream), block_iter, block_iter + num_batches + 1, + block_starts.begin()); // in blocks + + thrust::transform( + rmm::exec_policy(stream), iter, iter + total_blocks, blocks.begin(), + [=, block_starts = block_starts.data(), + batch_row_offsets = batch_row_offsets.data()] __device__(size_type block_index) { + // what batch this block falls in + auto const batch_index_iter = + thrust::upper_bound(thrust::seq, block_starts, block_starts + num_batches, block_index); + auto const batch_index = std::distance(block_starts, batch_index_iter) - 1; + // local index within the block + int const local_block_index = block_index - block_starts[batch_index]; + // the start row for this batch. + int const batch_row_start = batch_row_offsets[batch_index]; + // the start row for this block + int const block_row_start = batch_row_start + (local_block_index * desired_window_height); + // the end row for this block + int const max_row = std::min(total_number_of_rows - 1, + batch_index + 1 > num_batches ? + std::numeric_limits::max() : + static_cast(batch_row_offsets[batch_index + 1]) - 1); + int const block_row_end = std::min( + batch_row_start + ((local_block_index + 1) * desired_window_height) - 1, max_row); + + // stuff the block + return block_info{column_start, block_row_start, column_end, block_row_end, + static_cast(batch_index)}; + }); + + return total_blocks; +} + +/** + * @brief Determines what data should be operated on by each block for the incoming table. + * + * @tparam WindowCallback Callback that receives the start and end columns of windows + * @param column_sizes vector of the size of each column + * @param column_starts vector of the offset of each column + * @param first_row_batch_size size of the first row batch to limit max window size since a window + * is unable to span batches + * @param total_number_of_rows total number of rows in the table + * @param shmem_limit_per_block shared memory allowed per block + * @param f callback function called when building a window + */ +template +void determine_windows(std::vector const &column_sizes, + std::vector const &column_starts, + size_type const first_row_batch_size, size_type const total_number_of_rows, + size_type const &shmem_limit_per_block, WindowCallback f) { // block infos are organized with the windows going "down" the columns // this provides the most coalescing of memory access int current_window_width = 0; int current_window_start_col = 0; - // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( - int const start_col, int const end_col, int const desired_window_height) { - int current_window_start_row = 0; - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; - while (i < total_number_of_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(desired_window_height, rows_left_in_batch); - - block_infos.emplace_back(detail::block_info{ - start_col, current_window_start_row, end_col, - std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), - current_window_row_batch}); - - i += window_height; - current_window_start_row += window_height; - rows_left_in_batch -= window_height; - } - }; - // the ideal window height has lots of 8-byte reads and 8-byte writes. 
The optimal read/write // would be memory cache line sized access, but since other blocks will read/write the edges // this may not turn out to be overly important. For now, we will attempt to build a square @@ -1183,12 +1372,10 @@ std::vector build_block_infos(std::vector const &column_s // shared_mem_size since we want them equal, so height and width are sqrt(shared_mem_size). The // trick is that it's in bytes, not rows or columns. size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); - int const window_height = std::clamp( - util::round_up_safe( - std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], - total_number_of_rows), - 32), - 1, row_batches[0].row_count); + int const window_height = + std::clamp(util::round_up_safe( + std::min(optimal_square_len / column_sizes[0], total_number_of_rows), 32), + 1, first_row_batch_size); auto calc_admin_data_size = [](int num_cols) -> size_type { // admin data is the column sizes and column start information. @@ -1213,7 +1400,8 @@ std::vector build_block_infos(std::vector const &column_s calc_admin_data_size(col - current_window_start_col) > shmem_limit_per_block) { // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col == 0 ? col : col - 1, window_height); + f(current_window_start_col, col == 0 ? col : col - 1, window_height); + row_size = detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); row_size += col_size; // alignment required for shared memory window boundary to match @@ -1228,12 +1416,24 @@ std::vector build_block_infos(std::vector const &column_s // build last set of blocks if (current_window_width > 0) { - build_blocks(current_window_start_col, (int)column_sizes.size() - 1, window_height); + f(current_window_start_col, (int)column_sizes.size() - 1, window_height); } - - return block_infos; } +struct row_size_functor { + size_type _fixed_width_size_per_row; + size_type _num_columns; + row_size_functor(size_t fixed_width_size_per_row, size_t num_columns) + : _fixed_width_size_per_row(fixed_width_size_per_row), _num_columns(num_columns){}; + + CUDA_DEVICE_CALLABLE + int operator()(int row_index) { + auto const bytes_needed = + _fixed_width_size_per_row + util::div_rounding_up_safe(_num_columns, 8); + return detail::align_offset(bytes_needed, 8); + } +}; + #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } // namespace detail @@ -1242,9 +1442,6 @@ std::vector> convert_to_rows(cudf::table_view cons rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the - // data, but small enough that multiple columns fit in memory so the writes can coalese as well. - // Potential optimization for window sizes. const size_type num_columns = tbl.num_columns(); const size_type num_rows = tbl.num_rows(); @@ -1253,7 +1450,7 @@ std::vector> convert_to_rows(cudf::table_view cons int total_shmem; CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - // TODO: why? + // TODO: why is this needed. kernel fails to launch if all memory is requested. 
total_shmem -= 1024; int shmem_limit_per_block = total_shmem / NUM_BLOCKS_PER_KERNEL_LOADED; @@ -1277,150 +1474,113 @@ std::vector> convert_to_rows(cudf::table_view cons std::vector input_nm; input_data.reserve(num_columns); input_nm.reserve(num_columns); - for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); - auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (!nested_type) { - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - } + std::transform( + tbl.begin(), tbl.end(), std::back_inserter(input_data), + [](cudf::column_view const &c) -> int8_t const * { return c.template data(); }); + std::transform(tbl.begin(), tbl.end(), std::back_inserter(input_nm), + [](auto c) { return c.null_mask(); }); auto dev_input_data = make_device_uvector_async(input_data, stream, mr); auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); - std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column std::vector column_starts; // offset of column inside a row including alignment - std::vector - variable_width_columns; // list of the variable width columns in the table - row_sizes.reserve(num_rows); - row_offsets.reserve(num_rows); column_sizes.reserve(num_columns); column_starts.reserve(num_columns + 1); // we add a final offset for validity data start - auto iter = + auto schema_column_iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&tbl](auto i) -> std::tuple { return std::make_tuple(tbl.column(i).type(), tbl.column(i)); }); - size_type fixed_width_size_per_row = - detail::compute_column_information(iter, iter + num_columns, column_starts, column_sizes); + size_type fixed_width_size_per_row = detail::compute_column_information( + schema_column_iter, schema_column_iter + num_columns, column_starts, column_sizes); auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); - std::vector row_batches; - - uint64_t row_batch_size = 0; - uint64_t total_table_size = 0; - size_type row_batch_rows = 0; - uint64_t row_offset = 0; + // total encoded row size. This includes fixed-width data, validity, and variable-width data. + auto row_size_iter = cudf::detail::make_counting_transform_iterator( + 0, detail::row_size_functor(fixed_width_size_per_row, num_columns)); // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. 
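// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch above): the per-row size built
// here is the fixed-width payload plus one validity bit per column rounded up
// to whole bytes, with the whole row padded to an 8-byte boundary. Host-only
// C++ with hypothetical names (align_up, jcudf_row_size), shown for clarity.
#include <cstddef>

constexpr std::size_t align_up(std::size_t value, std::size_t alignment) {
  return (value + alignment - 1) & ~(alignment - 1);
}

constexpr std::size_t jcudf_row_size(std::size_t fixed_width_bytes, std::size_t num_columns) {
  std::size_t const validity_bytes = (num_columns + 7) / 8; // one validity bit per column
  return align_up(fixed_width_bytes + validity_bytes, 8);   // rows are 8-byte aligned
}

// e.g. 36 bytes of fixed-width data across 9 columns: 36 + 2 validity bytes, padded to 40
static_assert(jcudf_row_size(36, 9) == 40, "row size example");
// ---------------------------------------------------------------------------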
auto validity_size = num_bitmask_words(num_columns) * 4; - // thrust - for (int row = 0; row < num_rows; ++row) { - auto aligned_row_batch_size = - detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned - row_sizes[row] = fixed_width_size_per_row; - // validity is byte aligned - row_sizes[row] += validity_size; - // variable width data is 8-byte aligned - row_sizes[row] = detail::align_offset(row_sizes[row], 8); // rows are 8 byte aligned - - if ((uint64_t)aligned_row_batch_size + row_sizes[row] > - (uint64_t)std::numeric_limits::max()) { - // a new batch starts at the last 32-row boundary - row_batches.push_back( - detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); - row_batch_size = 0; - row_batch_rows = row_batch_rows & 31; - row_offset = 0; - aligned_row_batch_size = 0; - } - row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned - row_offsets.push_back(row_offset); - row_batch_size = aligned_row_batch_size + row_sizes[row]; - row_offset += row_sizes[row]; - total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned - total_table_size += row_sizes[row]; - row_batch_rows++; - } - if (row_batch_size > 0) { - row_batches.push_back( - detail::row_batch{static_cast(row_batch_size), row_batch_rows}); - } - auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); + auto batch_info = detail::build_batches(num_rows, row_size_iter, stream, mr); + auto gpu_batch_row_boundaries = + make_device_uvector_async(batch_info.batch_row_boundaries, stream); + + // the first batch always exists unless we were sent an empty table + auto const first_batch_size = batch_info.row_batches[0].row_count; std::vector output_buffers; std::vector output_data; - output_data.reserve(row_batches.size()); - for (uint i = 0; i < row_batches.size(); ++i) { - rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); + output_data.reserve(batch_info.row_batches.size()); + for (uint i = 0; i < batch_info.row_batches.size(); ++i) { + rmm::device_buffer temp(batch_info.row_batches[i].num_bytes, stream, mr); output_data.push_back(static_cast(temp.data())); output_buffers.push_back(std::move(temp)); } auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - - auto dev_block_infos = make_device_uvector_async(block_infos, stream); + int info_count = 0; + detail::determine_windows( + column_sizes, column_starts, first_batch_size, num_rows, shmem_limit_per_block, + [&gpu_batch_row_boundaries, &info_count, &stream](int const start_col, int const end_col, + int const window_height) { + int i = detail::compute_block_counts(gpu_batch_row_boundaries, window_height, stream); + info_count += i; + }); + + // allocate space for blocks + device_uvector gpu_block_infos(info_count, stream); + int block_offset = 0; + + detail::determine_windows( + column_sizes, column_starts, first_batch_size, num_rows, shmem_limit_per_block, + [&gpu_batch_row_boundaries, &gpu_block_infos, num_rows, &block_offset, + stream](int const start_col, int const end_col, int const window_height) { + block_offset += detail::build_blocks( + {gpu_block_infos.data() + block_offset, gpu_block_infos.size() - block_offset}, + gpu_batch_row_boundaries, start_col, end_col, window_height, num_rows, stream); + }); // blast through the entire table and convert it - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), 
NUM_BLOCKS_PER_KERNEL_TO_ROWS)); + dim3 blocks(util::div_rounding_up_unsafe(gpu_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); dim3 threads(256); detail::copy_to_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, block_infos.size(), dev_input_data.data(), - dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), dev_row_offsets.data(), + num_rows, num_columns, shmem_limit_per_block, gpu_block_infos, dev_input_data.data(), + dev_col_sizes.data(), dev_col_starts.data(), batch_info.input_data_row_offsets.data(), reinterpret_cast(dev_output_data.data())); - auto validity_block_infos = - build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); + auto validity_block_infos = detail::build_validity_block_infos( + num_columns, num_rows, shmem_limit_per_block, batch_info.row_batches); auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream); dim3 validity_blocks( util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_VALIDITY_BLOCKS_PER_KERNEL)); dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); detail::copy_validity_to_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, dev_row_offsets.data(), dev_output_data.data(), - column_starts.back(), dev_validity_block_infos.data(), validity_block_infos.size(), - dev_input_nm.data()); + num_rows, num_columns, shmem_limit_per_block, batch_info.input_data_row_offsets.data(), + dev_output_data.data(), column_starts.back(), dev_validity_block_infos, dev_input_nm.data()); // split up the output buffer into multiple buffers based on row batch sizes // and create list of byte columns - int offset_offset = 0; std::vector> ret; - for (uint i = 0; i < row_batches.size(); ++i) { - // compute offsets for this row batch - std::vector offset_vals; - offset_vals.reserve(row_batches[i].row_count + 1); - size_type cur_offset = 0; - offset_vals.push_back(cur_offset); - for (int row = 0; row < row_batches[i].row_count; ++row) { - cur_offset = detail::align_offset(cur_offset, 8) + row_sizes[row + offset_offset]; - offset_vals.push_back(cur_offset); - } - offset_offset += row_batches[i].row_count; - - auto dev_offsets = make_device_uvector_async(offset_vals, stream, mr); - auto offsets = std::make_unique(data_type{type_id::INT32}, - (size_type)offset_vals.size(), dev_offsets.release()); - - auto data = std::make_unique(data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, - std::move(output_buffers[i])); - - ret.push_back( - cudf::make_lists_column(row_batches[i].row_count, std::move(offsets), std::move(data), 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr)); + for (int batch = 0; batch < (int)batch_info.row_batches.size(); ++batch) { + auto const offset_count = batch_info.row_batches[batch].row_offsets.size(); + auto offsets = std::make_unique(data_type{type_id::INT32}, (size_type)offset_count, + batch_info.row_batches[batch].row_offsets.release()); + auto data = + std::make_unique(data_type{type_id::INT8}, batch_info.row_batches[batch].num_bytes, + std::move(output_buffers[batch])); + + ret.push_back(cudf::make_lists_column( + batch_info.row_batches[batch].row_count, std::move(offsets), std::move(data), 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr)); } return ret; @@ -1437,7 +1597,8 @@ convert_to_rows_fixed_width_optimized(cudf::table_view const &tbl, rmm::cuda_str std::vector schema; schema.resize(num_columns); - std::transform(tbl.begin(), tbl.end(), schema.begin(), detail::get_data_type); + 
std::transform(tbl.begin(), tbl.end(), schema.begin(), + [](auto i) -> cudf::data_type { return i.type(); }); if (detail::are_all_fixed_width(schema)) { std::vector column_start; @@ -1509,7 +1670,7 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in int total_shmem; CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - // TODO why? + // TODO: why is this needed. kernel fails to launch if all memory is requested. total_shmem -= 1024; int shmem_limit_per_block = total_shmem / NUM_BLOCKS_PER_KERNEL_LOADED; @@ -1519,8 +1680,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { return std::make_tuple(schema[i], nullptr); }); - size_type fixed_width_size_per_row = detail::compute_column_information( - iter, iter + num_columns, column_starts, column_sizes); //, [](void *) {}); + size_type fixed_width_size_per_row = + detail::compute_column_information(iter, iter + num_columns, column_starts, column_sizes); size_type validity_size = num_bitmask_words(num_columns) * 4; @@ -1534,8 +1695,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in // build the row_batches from the passed in list column std::vector row_batches; - - row_batches.push_back(detail::row_batch{child.size(), num_rows}); + row_batches.push_back( + {detail::row_batch{child.size(), num_rows, device_uvector(0, stream)}}); // Allocate the columns we are going to write into std::vector> output_columns; @@ -1553,45 +1714,48 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in auto dev_output_data = make_device_uvector_async(output_data, stream, mr); auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); + // only ever get a single batch when going from rows, so boundaries + // are 0, num_rows + device_uvector gpu_batch_row_boundaries(2, stream); + + thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), + thrust::make_counting_iterator(2), gpu_batch_row_boundaries.begin(), + [num_rows] __device__(auto i) { return i == 0 ? 
0 : num_rows; }); + + int info_count = 0; + detail::determine_windows(column_sizes, column_starts, num_rows, num_rows, shmem_limit_per_block, + [&gpu_batch_row_boundaries, &info_count, &stream]( + int const start_col, int const end_col, int const window_height) { + info_count += detail::compute_block_counts(gpu_batch_row_boundaries, + window_height, stream); + }); + + // allocate space for blocks + device_uvector gpu_block_infos(info_count, stream); + + int block_offset = 0; + detail::determine_windows( + column_sizes, column_starts, num_rows, num_rows, shmem_limit_per_block, + [&gpu_batch_row_boundaries, &gpu_block_infos, num_rows, &block_offset, + stream](int const start_col, int const end_col, int const window_height) { + block_offset += detail::build_blocks( + {gpu_block_infos.data() + block_offset, gpu_block_infos.size() - block_offset}, + gpu_batch_row_boundaries, start_col, end_col, window_height, num_rows, stream); + }); + + dim3 blocks( + util::div_rounding_up_unsafe(gpu_block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); dim3 threads(std::min(std::min(256, shmem_limit_per_block / 8), (int)child.size())); detail::copy_from_rows<<>>( num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), - dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), - block_infos.size(), child.data()); + dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), gpu_block_infos, + child.data()); + + auto validity_block_infos = + detail::build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); + + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream); - auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); - auto const column_stride = [&]() { - if (desired_rows_and_columns > num_columns) { - // not many columns, group it into 64s and ship it off - return std::min(64, num_columns); - } else { - return util::round_down_safe(desired_rows_and_columns, 8); - } - }(); - auto const row_stride = [&]() { - // we fit as much as we can, we know the column stride now, so calculate the row - return std::min(num_rows, util::round_down_safe(shmem_limit_per_block * 8 / column_stride, 32)); - /* if (desired_rows_and_columns > num_rows) { - return std::min(32, num_rows); - } else { - return util::round_down_safe(desired_rows_and_columns, 32); - }*/ - }(); - std::vector validity_block_infos; - for (int col = 0; col < num_columns; col += column_stride) { - for (int row = 0; row < num_rows; row += row_stride) { - validity_block_infos.emplace_back( - detail::block_info{col, row, std::min(col + column_stride - 1, num_columns - 1), - std::min(row + row_stride - 1, num_rows - 1)}); - } - } - auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); dim3 validity_blocks( util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_VALIDITY_BLOCKS_PER_KERNEL)); @@ -1599,8 +1763,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in detail:: copy_validity_from_rows<<>>( num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), - dev_output_nm.data(), column_starts.back(), dev_validity_block_infos.data(), - validity_block_infos.size(), child.data()); + dev_output_nm.data(), column_starts.back(), dev_validity_block_infos, + child.data()); return std::make_unique(std::move(output_columns)); #else @@ -1665,4 +1829,6 @@ std::unique_ptr convert_from_rows_fixed_width_optimized( } } +} // namespace java + } // namespace cudf From 
630222a841470848141cb57646350420c5e05452 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 4 Nov 2021 00:02:24 +0000 Subject: [PATCH 65/80] fixing overflow issues with large tables --- java/src/main/native/src/row_conversion.cu | 202 +++++++++++---------- 1 file changed, 110 insertions(+), 92 deletions(-) diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 87ab1ed49d8..c5bbed5274c 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -15,6 +15,8 @@ */ #include +#include +#include #include #include #include @@ -25,6 +27,8 @@ #include #include +#include "thrust/scan.h" + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 #include #endif @@ -50,7 +54,6 @@ #include #include #include -#include #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 2; @@ -336,7 +339,7 @@ struct block_info { int start_row; int end_col; int end_row; - int buffer_num; + int batch_number; __host__ __device__ size_type get_shared_row_size(size_type const *const col_offsets, size_type const *const col_sizes) const { @@ -369,7 +372,7 @@ struct row_batch { * @param input_data pointer to raw table data * @param col_sizes array of sizes for each element in a column - one per column * @param col_offsets offset into input data row for each column's start - * @param row_offsets offset to a specific row in the input data + * @param row_offsets offset to a specific row in the output data * @param output_data pointer to output data * */ @@ -470,7 +473,7 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + subset]; auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; - auto const block_output_buffer = output_data[block.buffer_num]; + auto const block_output_buffer = output_data[block.batch_number]; // copy entire rows to final dest for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; @@ -496,7 +499,7 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table * @param shmem_used_per_block amount of shared memory that is used by a block - * @param row_offsets offset to a specific row in the input data + * @param row_offsets offset to a specific row in the output data * @param output_data pointer to output data, partitioned by data size * @param validity_offsets offset into input data row for validity data * @param block_infos information about the blocks of work @@ -610,7 +613,7 @@ __global__ void copy_validity_to_rows(const size_type num_rows, const size_type group.sync(); auto const output_data_base = - output_data[block.buffer_num] + validity_offset + block.start_col / 8; + output_data[block.batch_number] + validity_offset + block.start_col / 8; // now async memcpy the shared memory out to the final destination for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { @@ -1176,12 +1179,18 @@ constexpr size_type max_batch_size = std::numeric_limits::max(); * */ struct batch_data { + device_uvector batch_row_offsets; std::vector batch_row_boundaries; - device_uvector input_data_row_offsets; std::vector row_batches; +}; - batch_data(size_type num_input_offsets, rmm::cuda_stream_view stream) - : 
input_data_row_offsets(num_input_offsets, stream){}; +template struct row_size_functor { + RowSize _row_sizes; + size_type _num_rows; + row_size_functor(RowSize row_sizes) : _row_sizes(row_sizes){}; + + CUDA_DEVICE_CALLABLE + uint64_t operator()(int row_index) { return static_cast(_row_sizes[row_index]); } }; /** @@ -1199,19 +1208,26 @@ struct batch_data { template batch_data build_batches(size_type num_rows, RowSize row_sizes, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - auto const total_size = thrust::reduce(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows); - auto const num_batches = util::div_rounding_up_safe(total_size, max_batch_size); + auto uint64_row_sizes = + cudf::detail::make_counting_transform_iterator(0, row_size_functor(row_sizes)); + auto const total_size = + thrust::reduce(rmm::exec_policy(stream), uint64_row_sizes, uint64_row_sizes + num_rows); + auto const num_batches = static_cast( + util::div_rounding_up_safe(total_size, static_cast(max_batch_size))); auto const num_offsets = num_batches + 1; - batch_data ret(num_rows + 1, stream); + std::vector row_batches; + std::vector batch_row_boundaries; + device_uvector batch_row_offsets(num_rows, stream); // at most max gpu memory / 2GB iterations. - ret.batch_row_boundaries.reserve(num_offsets); - ret.batch_row_boundaries.push_back(0); + batch_row_boundaries.reserve(num_offsets); + batch_row_boundaries.push_back(0); size_type last_row_end = 0; - device_uvector cumulative_row_sizes(num_rows, stream); - thrust::inclusive_scan(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows, + device_uvector cumulative_row_sizes(num_rows, stream); + thrust::inclusive_scan(rmm::exec_policy(stream), uint64_row_sizes, uint64_row_sizes + num_rows, cumulative_row_sizes.begin()); - while ((int)ret.batch_row_boundaries.size() < num_offsets) { + + while ((int)batch_row_boundaries.size() < num_offsets) { // find the next max_batch_size boundary size_type const row_end = ((thrust::lower_bound(rmm::exec_policy(stream), cumulative_row_sizes.begin(), @@ -1220,6 +1236,9 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, rmm::cuda_stream cumulative_row_sizes.begin()) + last_row_end); + // build offset list for each row in this batch + auto const num_rows_in_batch = row_end - last_row_end; + // build offset list for each row in this batch auto const num_entries = row_end - last_row_end + 1; device_uvector output_batch_row_offsets(num_entries, stream, mr); @@ -1232,44 +1251,44 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, rmm::cuda_stream thrust::exclusive_scan(rmm::exec_policy(stream), row_size_iter_bounded, row_size_iter_bounded + num_entries, output_batch_row_offsets.begin()); - ret.batch_row_boundaries.push_back(row_end); - auto const batch_bytes = output_batch_row_offsets.element(row_end, stream) - - output_batch_row_offsets.element(last_row_end, stream); - auto const num_rows_in_batch = row_end - last_row_end; - ret.row_batches.push_back( - {batch_bytes, num_rows_in_batch, std::move(output_batch_row_offsets)}); + auto const batch_bytes = output_batch_row_offsets.element(num_rows_in_batch, stream); + + // The output_batch_row_offsets vector is used as the offset column of the returned data. This + // needs to be individually allocated, but the kernel needs a contiguous array of offsets or + // more global lookups are necessary. 
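// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch above): build_batches cuts the
// table into batches whose encoded bytes each stay under the 2GB offset limit
// of a list column, which it does on the device with a prefix sum and
// lower_bound. A host-side greedy version with hypothetical names produces the
// same style of boundaries and may be easier to follow.
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<int> batch_row_boundaries_sketch(std::vector<uint64_t> const &row_sizes,
                                             uint64_t max_batch_bytes) {
  std::vector<int> boundaries{0}; // the first batch always starts at row 0
  uint64_t batch_bytes = 0;
  for (std::size_t row = 0; row < row_sizes.size(); ++row) {
    if (batch_bytes > 0 && batch_bytes + row_sizes[row] > max_batch_bytes) {
      boundaries.push_back(static_cast<int>(row)); // this row opens a new batch
      batch_bytes = 0;
    }
    batch_bytes += row_sizes[row];
  }
  boundaries.push_back(static_cast<int>(row_sizes.size())); // closing boundary
  return boundaries;
}
// ---------------------------------------------------------------------------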
+ cudaMemcpy(batch_row_offsets.data() + last_row_end, output_batch_row_offsets.data(), + num_rows_in_batch * sizeof(size_type), cudaMemcpyDeviceToDevice); + + batch_row_boundaries.push_back(row_end); + row_batches.push_back({batch_bytes, num_rows_in_batch, std::move(output_batch_row_offsets)}); + last_row_end = row_end; } - auto row_size_iter = cudf::detail::make_counting_transform_iterator( - 0, [row_sizes, num_rows] __device__(auto i) { return (i < num_rows) ? row_sizes[i] : 0; }); - thrust::exclusive_scan(rmm::exec_policy(stream), row_size_iter, row_size_iter + num_rows + 1, - ret.input_data_row_offsets.begin()); - - return ret; + return {std::move(batch_row_offsets), batch_row_boundaries, std::move(row_batches)}; } /** * @brief Computes the number of blocks necessary given a window height and batch offsets * - * @param batch_row_offsets row offsets for each batch + * @param batch_row_boundaries row boundaries for each batch * @param desired_window_height height of each window in the table * @param stream stream to use * @return number of windows necessary */ -int compute_block_counts(device_span const &batch_row_offsets, +int compute_block_counts(device_span const &batch_row_boundaries, int desired_window_height, rmm::cuda_stream_view stream) { - size_type const num_batches = batch_row_offsets.size() - 1; + size_type const num_batches = batch_row_boundaries.size() - 1; device_uvector num_blocks(num_batches, stream); auto iter = thrust::make_counting_iterator(0); - thrust::transform( - rmm::exec_policy(stream), iter, iter + num_batches, num_blocks.begin(), - [desired_window_height, - batch_row_offsets = batch_row_offsets.data()] __device__(auto batch_index) -> size_type { - return util::div_rounding_up_unsafe(batch_row_offsets[batch_index + 1] - - batch_row_offsets[batch_index], - desired_window_height); - }); + thrust::transform(rmm::exec_policy(stream), iter, iter + num_batches, num_blocks.begin(), + [desired_window_height, + batch_row_boundaries = + batch_row_boundaries.data()] __device__(auto batch_index) -> size_type { + return util::div_rounding_up_unsafe(batch_row_boundaries[batch_index + 1] - + batch_row_boundaries[batch_index], + desired_window_height); + }); return thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); } @@ -1277,7 +1296,7 @@ int compute_block_counts(device_span const &batch_row_offsets, * @brief Builds the `block_info` structs for a given table. 
* * @param blocks span of blocks to populate - * @param batch_row_offsets offsets to row batches + * @param batch_row_boundaries boundary to row batches * @param column_start starting column of the window * @param column_end ending column of the window * @param desired_window_height height of the window @@ -1287,20 +1306,20 @@ int compute_block_counts(device_span const &batch_row_offsets, */ size_type build_blocks(device_span blocks, - device_uvector const &batch_row_offsets, // comes from build_batches + device_uvector const &batch_row_boundaries, // comes from build_batches int column_start, int column_end, int desired_window_height, int total_number_of_rows, rmm::cuda_stream_view stream) { - size_type const num_batches = batch_row_offsets.size() - 1; + size_type const num_batches = batch_row_boundaries.size() - 1; device_uvector num_blocks(num_batches, stream); auto iter = thrust::make_counting_iterator(0); - thrust::transform( - rmm::exec_policy(stream), iter, iter + num_batches, num_blocks.begin(), - [desired_window_height, - batch_row_offsets = batch_row_offsets.data()] __device__(auto batch_index) -> size_type { - return util::div_rounding_up_unsafe(batch_row_offsets[batch_index + 1] - - batch_row_offsets[batch_index], - desired_window_height); - }); + thrust::transform(rmm::exec_policy(stream), iter, iter + num_batches, num_blocks.begin(), + [desired_window_height, + batch_row_boundaries = + batch_row_boundaries.data()] __device__(auto batch_index) -> size_type { + return util::div_rounding_up_unsafe(batch_row_boundaries[batch_index + 1] - + batch_row_boundaries[batch_index], + desired_window_height); + }); size_type const total_blocks = thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); @@ -1316,7 +1335,7 @@ build_blocks(device_span blocks, thrust::transform( rmm::exec_policy(stream), iter, iter + total_blocks, blocks.begin(), [=, block_starts = block_starts.data(), - batch_row_offsets = batch_row_offsets.data()] __device__(size_type block_index) { + batch_row_boundaries = batch_row_boundaries.data()] __device__(size_type block_index) { // what batch this block falls in auto const batch_index_iter = thrust::upper_bound(thrust::seq, block_starts, block_starts + num_batches, block_index); @@ -1324,14 +1343,15 @@ build_blocks(device_span blocks, // local index within the block int const local_block_index = block_index - block_starts[batch_index]; // the start row for this batch. - int const batch_row_start = batch_row_offsets[batch_index]; + int const batch_row_start = batch_row_boundaries[batch_index]; // the start row for this block int const block_row_start = batch_row_start + (local_block_index * desired_window_height); // the end row for this block - int const max_row = std::min(total_number_of_rows - 1, - batch_index + 1 > num_batches ? - std::numeric_limits::max() : - static_cast(batch_row_offsets[batch_index + 1]) - 1); + int const max_row = + std::min(total_number_of_rows - 1, + batch_index + 1 > num_batches ? 
+ std::numeric_limits::max() : + static_cast(batch_row_boundaries[batch_index + 1]) - 1); int const block_row_end = std::min( batch_row_start + ((local_block_index + 1) * desired_window_height) - 1, max_row); @@ -1420,20 +1440,6 @@ void determine_windows(std::vector const &column_sizes, } } -struct row_size_functor { - size_type _fixed_width_size_per_row; - size_type _num_columns; - row_size_functor(size_t fixed_width_size_per_row, size_t num_columns) - : _fixed_width_size_per_row(fixed_width_size_per_row), _num_columns(num_columns){}; - - CUDA_DEVICE_CALLABLE - int operator()(int row_index) { - auto const bytes_needed = - _fixed_width_size_per_row + util::div_rounding_up_safe(_num_columns, 8); - return detail::align_offset(bytes_needed, 8); - } -}; - #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } // namespace detail @@ -1502,7 +1508,11 @@ std::vector> convert_to_rows(cudf::table_view cons // total encoded row size. This includes fixed-width data, validity, and variable-width data. auto row_size_iter = cudf::detail::make_counting_transform_iterator( - 0, detail::row_size_functor(fixed_width_size_per_row, num_columns)); + 0, [fixed_width_size_per_row, num_columns] __device__(auto i) { + auto const bytes_needed = + fixed_width_size_per_row + util::div_rounding_up_safe(num_columns, 8); + return detail::align_offset(bytes_needed, 8); + }); // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. @@ -1518,11 +1528,14 @@ std::vector> convert_to_rows(cudf::table_view cons std::vector output_buffers; std::vector output_data; output_data.reserve(batch_info.row_batches.size()); - for (uint i = 0; i < batch_info.row_batches.size(); ++i) { - rmm::device_buffer temp(batch_info.row_batches[i].num_bytes, stream, mr); - output_data.push_back(static_cast(temp.data())); - output_buffers.push_back(std::move(temp)); - } + output_buffers.reserve(batch_info.row_batches.size()); + std::transform(batch_info.row_batches.begin(), batch_info.row_batches.end(), + std::back_inserter(output_buffers), [&](auto const &batch) { + return rmm::device_buffer(batch.num_bytes, stream, mr); + }); + std::transform(output_buffers.begin(), output_buffers.end(), std::back_inserter(output_data), + [](auto &buf) { return static_cast(buf.data()); }); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); int info_count = 0; @@ -1551,11 +1564,6 @@ std::vector> convert_to_rows(cudf::table_view cons dim3 blocks(util::div_rounding_up_unsafe(gpu_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); dim3 threads(256); - detail::copy_to_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, gpu_block_infos, dev_input_data.data(), - dev_col_sizes.data(), dev_col_starts.data(), batch_info.input_data_row_offsets.data(), - reinterpret_cast(dev_output_data.data())); - auto validity_block_infos = detail::build_validity_block_infos( num_columns, num_rows, shmem_limit_per_block, batch_info.row_batches); @@ -1563,8 +1571,16 @@ std::vector> convert_to_rows(cudf::table_view cons dim3 validity_blocks( util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_VALIDITY_BLOCKS_PER_KERNEL)); dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); + + detail::copy_to_rows<<>>( + num_rows, num_columns, shmem_limit_per_block, gpu_block_infos, dev_input_data.data(), + dev_col_sizes.data(), dev_col_starts.data(), + batch_info.batch_row_offsets + .data(), // needs to be row offsets per 
batch, not overall JUST for output. + reinterpret_cast(dev_output_data.data())); + detail::copy_validity_to_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, batch_info.input_data_row_offsets.data(), + num_rows, num_columns, shmem_limit_per_block, batch_info.batch_row_offsets.data(), dev_output_data.data(), column_starts.back(), dev_validity_block_infos, dev_input_nm.data()); // split up the output buffer into multiple buffers based on row batch sizes @@ -1693,11 +1709,6 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); - // build the row_batches from the passed in list column - std::vector row_batches; - row_batches.push_back( - {detail::row_batch{child.size(), num_rows, device_uvector(0, stream)}}); - // Allocate the columns we are going to write into std::vector> output_columns; std::vector output_data; @@ -1711,6 +1722,11 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in output_columns.emplace_back(std::move(column)); } + // build the row_batches from the passed in list column + std::vector row_batches; + row_batches.push_back( + {detail::row_batch{child.size(), num_rows, device_uvector(0, stream)}}); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); @@ -1746,10 +1762,6 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in dim3 blocks( util::div_rounding_up_unsafe(gpu_block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); dim3 threads(std::min(std::min(256, shmem_limit_per_block / 8), (int)child.size())); - detail::copy_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), - dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), gpu_block_infos, - child.data()); auto validity_block_infos = detail::build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); @@ -1760,6 +1772,12 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_VALIDITY_BLOCKS_PER_KERNEL)); dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); + + detail::copy_from_rows<<>>( + num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), + dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), gpu_block_infos, + child.data()); + detail:: copy_validity_from_rows<<>>( num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), From 5e66d7ce21ebf7d8ea0f75d47bad70ad0f29e9a5 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 9 Nov 2021 03:50:24 +0000 Subject: [PATCH 66/80] fixing includes for java --- java/src/main/native/src/row_conversion.cu | 26 ++++++++++++---------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index c5bbed5274c..f9cb61f4ea1 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -23,16 +23,24 @@ #include #include -#include -#include #include -#include "thrust/scan.h" - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 #include #endif +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include #include #include #include @@ -40,20 +48,14 @@ #include #include #include -#include #include 
#include #include #include #include #include -#include -#include -#include -#include -#include -#include -#include + +#include "row_conversion.hpp" #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 2; From 37feaa1a90107cac32bdd3c5cbc02b17da2ffb9e Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 16 Nov 2021 07:03:37 +0000 Subject: [PATCH 67/80] updating from review comments --- .../cudf/detail/utilities/integer_utils.hpp | 9 + cpp/src/copying/contiguous_split.cu | 9 +- java/src/main/native/src/row_conversion.cu | 449 +++++++++--------- 3 files changed, 231 insertions(+), 236 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp index dc919433da7..48618ae53a1 100644 --- a/cpp/include/cudf/detail/utilities/integer_utils.hpp +++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp @@ -60,6 +60,15 @@ inline S round_down_safe(S number_to_round, S modulus) return rounded_down; } +template +constexpr inline S round_up_unsafe(S number_to_round, S modulus) noexcept +{ + auto remainder = number_to_round % modulus; + if (remainder == 0) { return number_to_round; } + auto rounded_up = number_to_round - remainder + modulus; + return rounded_up; +} + /** * Divides the left-hand-side by the right-hand-side, rounding up * to an integral multiple of the right-hand-side, e.g. (9,5) -> 2 , (10,5) -> 2, (11,5) -> 3. diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index a9194ceea93..6467a816542 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -40,13 +40,6 @@ namespace { // align all column size allocations to this boundary so that all output column buffers // start at that alignment. static constexpr std::size_t split_align = 64; -inline __device__ std::size_t _round_up_safe(std::size_t number_to_round, std::size_t modulus) -{ - auto remainder = number_to_round % modulus; - if (remainder == 0) { return number_to_round; } - auto rounded_up = number_to_round - remainder + modulus; - return rounded_up; -} /** * @brief Struct which contains information on a source buffer. 
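// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch above): the round_up_unsafe helper
// added to integer_utils.hpp rounds a value up to the next multiple of
// `modulus`; "unsafe" refers to the lack of an overflow check near the type's
// maximum. A standalone copy with a few spot checks:
template <typename S>
constexpr S round_up_unsafe_sketch(S number_to_round, S modulus) noexcept {
  S const remainder = number_to_round % modulus;
  return remainder == 0 ? number_to_round : number_to_round - remainder + modulus;
}

static_assert(round_up_unsafe_sketch(64ul, 64ul) == 64ul, "already a multiple");
static_assert(round_up_unsafe_sketch(65ul, 64ul) == 128ul, "rounds up to the next multiple");
static_assert(round_up_unsafe_sketch(9, 5) == 10, "9 rounded up to a multiple of 5");
// ---------------------------------------------------------------------------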
@@ -953,7 +946,7 @@ std::vector contiguous_split(cudf::table_view const& input, std::size_t const bytes = static_cast(num_elements) * static_cast(element_size); - return dst_buf_info{_round_up_safe(bytes, 64), + return dst_buf_info{util::round_up_unsafe(bytes, 64ul), num_elements, element_size, num_rows, diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index f9cb61f4ea1..6f3998216b0 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -29,18 +29,6 @@ #include #endif -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include #include #include #include @@ -54,6 +42,14 @@ #include #include #include +#include +#include +#include +#include +#include +#include +#include +#include #include "row_conversion.hpp" @@ -65,24 +61,36 @@ constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; // needed to suppress warning about cuda::barrier -#pragma diag_suppress static_var_with_dynamic_init +#pragma nv_diag_suppress static_var_with_dynamic_init #endif -using cudf::detail::make_device_uvector_async; +using namespace cudf; +using detail::make_device_uvector_async; using rmm::device_uvector; namespace cudf { namespace java { namespace detail { -static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size_t alignment) { - return (offset + alignment - 1) & ~(alignment - 1); -} - -__global__ void copy_from_rows_fixed_width_optimized( - const cudf::size_type num_rows, const cudf::size_type num_columns, - const cudf::size_type row_size, const cudf::size_type *input_offset_in_row, - const cudf::size_type *num_bytes, int8_t **output_data, cudf::bitmask_type **output_nm, - const int8_t *input_data) { +/** + * @brief Copies data from row-base JCUDF format to column-based cudf format. + * + * This optimized version of the conversion is faster for fixed-width tables + * that do not have more than 100 columns. + * + * @param num_rows number of rows in the incoming table + * @param num_columns number of columns in the incoming table + * @param row_size length in bytes of each row + * @param input_offset_in_row offset to each row of data + * @param num_bytes total number of bytes in the incoming data + * @param output_data array of pointers to the output data + * @param output_nm array of pointers to the output null masks + * @param input_data pointing to the incoming row data + */ +__global__ void +copy_from_rows_fixed_width_optimized(const size_type num_rows, const size_type num_columns, + const size_type row_size, const size_type *input_offset_in_row, + const size_type *num_bytes, int8_t **output_data, + bitmask_type **output_nm, const int8_t *input_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -97,10 +105,10 @@ __global__ void copy_from_rows_fixed_width_optimized( // are controlled by the x dimension (there are multiple blocks in the x // dimension). 
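// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch above): the comment describes the
// launch shape of the fixed-width optimized kernels. threadIdx.x picks the row
// inside a row group, threadIdx.y strides over the columns, and only blockIdx.x
// varies, with extra row groups handled by a grid stride. The helper names and
// numbers below are hypothetical, for intuition only.
#include <algorithm>
#include <cuda_runtime.h>

inline dim3 fixed_width_block_shape(int rows_per_group, int column_threads) {
  return dim3(rows_per_group, column_threads); // x: one thread per row, y: column stride
}

inline dim3 fixed_width_grid_shape(int num_rows, int rows_per_group, int max_blocks = 65535) {
  int const row_groups = (num_rows + rows_per_group - 1) / rows_per_group;
  return dim3(std::min(row_groups, max_blocks)); // x only; remaining groups reached by striding
}
// ---------------------------------------------------------------------------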
- cudf::size_type rows_per_group = blockDim.x; - cudf::size_type row_group_start = blockIdx.x; - cudf::size_type row_group_stride = gridDim.x; - cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + size_type rows_per_group = blockDim.x; + size_type row_group_start = blockIdx.x; + size_type row_group_stride = gridDim.x; + size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; extern __shared__ int8_t shared_data[]; @@ -109,28 +117,27 @@ __global__ void copy_from_rows_fixed_width_optimized( int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; int8_t *row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; - for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; + for (size_type row_group_index = row_group_start; row_group_index < row_group_end; row_group_index += row_group_stride) { // Step 1: Copy the data into shared memory // We know row_size is always aligned with and a multiple of int64_t; int64_t *long_shared = reinterpret_cast(shared_data); const int64_t *long_input = reinterpret_cast(input_data); - cudf::size_type shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); - cudf::size_type shared_output_stride = blockDim.x * blockDim.y; - cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); + size_type shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); + size_type shared_output_stride = blockDim.x * blockDim.y; + size_type row_index_end = ((row_group_index + 1) * rows_per_group); if (row_index_end > num_rows) { row_index_end = num_rows; } - cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - cudf::size_type shared_length = row_size * num_rows_in_group; + size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); + size_type shared_length = row_size * num_rows_in_group; - cudf::size_type shared_output_end = shared_length / sizeof(int64_t); + size_type shared_output_end = shared_length / sizeof(int64_t); - cudf::size_type start_input_index = - (row_size * row_group_index * rows_per_group) / sizeof(int64_t); + size_type start_input_index = (row_size * row_group_index * rows_per_group) / sizeof(int64_t); - for (cudf::size_type shared_index = shared_output_index; shared_index < shared_output_end; + for (size_type shared_index = shared_output_index; shared_index < shared_output_end; shared_index += shared_output_stride) { long_shared[shared_index] = long_input[start_input_index + shared_index]; } @@ -141,17 +148,17 @@ __global__ void copy_from_rows_fixed_width_optimized( // Within the row group there should be 1 thread for each row. This is a // requirement for launching the kernel - cudf::size_type row_index = (row_group_index * rows_per_group) + threadIdx.x; + size_type row_index = (row_group_index * rows_per_group) + threadIdx.x; // But we might not use all of the threads if the number of rows does not go // evenly into the thread count. We don't want those threads to exit yet // because we may need them to copy data in for the next row group. 
uint32_t active_mask = __ballot_sync(0xffffffff, row_index < num_rows); if (row_index < num_rows) { - cudf::size_type col_index_start = threadIdx.y; - cudf::size_type col_index_stride = blockDim.y; - for (cudf::size_type col_index = col_index_start; col_index < num_columns; + size_type col_index_start = threadIdx.y; + size_type col_index_stride = blockDim.y; + for (size_type col_index = col_index_start; col_index < num_columns; col_index += col_index_stride) { - cudf::size_type col_size = num_bytes[col_index]; + size_type col_size = num_bytes[col_index]; const int8_t *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); int8_t *col_output = output_data[col_index]; switch (col_size) { @@ -175,18 +182,18 @@ __global__ void copy_from_rows_fixed_width_optimized( break; } default: { - cudf::size_type output_offset = col_size * row_index; + size_type output_offset = col_size * row_index; // TODO this should just not be supported for fixed width columns, but just in case... - for (cudf::size_type b = 0; b < col_size; b++) { + for (size_type b = 0; b < col_size; b++) { col_output[b + output_offset] = col_tmp[b]; } break; } } - cudf::bitmask_type *nm = output_nm[col_index]; + bitmask_type *nm = output_nm[col_index]; int8_t *valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; + size_type byte_bit_offset = col_index % 8; int predicate = *valid_byte & (1 << byte_bit_offset); uint32_t bitmask = __ballot_sync(active_mask, predicate); if (row_index % 32 == 0) { @@ -200,10 +207,9 @@ __global__ void copy_from_rows_fixed_width_optimized( } __global__ void copy_to_rows_fixed_width_optimized( - const cudf::size_type start_row, const cudf::size_type num_rows, - const cudf::size_type num_columns, const cudf::size_type row_size, - const cudf::size_type *output_offset_in_row, const cudf::size_type *num_bytes, - const int8_t **input_data, const cudf::bitmask_type **input_nm, int8_t *output_data) { + const size_type start_row, const size_type num_rows, const size_type num_columns, + const size_type row_size, const size_type *output_offset_in_row, const size_type *num_bytes, + const int8_t **input_data, const bitmask_type **input_nm, int8_t *output_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -220,10 +226,10 @@ __global__ void copy_to_rows_fixed_width_optimized( // are controlled by the x dimension (there are multiple blocks in the x // dimension). - cudf::size_type rows_per_group = blockDim.x; - cudf::size_type row_group_start = blockIdx.x; - cudf::size_type row_group_stride = gridDim.x; - cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + size_type rows_per_group = blockDim.x; + size_type row_group_start = blockIdx.x; + size_type row_group_stride = gridDim.x; + size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; extern __shared__ int8_t shared_data[]; @@ -233,20 +239,20 @@ __global__ void copy_to_rows_fixed_width_optimized( int8_t *row_vld_tmp = &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; - for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; + for (size_type row_group_index = row_group_start; row_group_index < row_group_end; row_group_index += row_group_stride) { // Within the row group there should be 1 thread for each row. 
This is a // requirement for launching the kernel - cudf::size_type row_index = start_row + (row_group_index * rows_per_group) + threadIdx.x; + size_type row_index = start_row + (row_group_index * rows_per_group) + threadIdx.x; // But we might not use all of the threads if the number of rows does not go // evenly into the thread count. We don't want those threads to exit yet // because we may need them to copy data back out. if (row_index < (start_row + num_rows)) { - cudf::size_type col_index_start = threadIdx.y; - cudf::size_type col_index_stride = blockDim.y; - for (cudf::size_type col_index = col_index_start; col_index < num_columns; + size_type col_index_start = threadIdx.y; + size_type col_index_stride = blockDim.y; + for (size_type col_index = col_index_start; col_index < num_columns; col_index += col_index_stride) { - cudf::size_type col_size = num_bytes[col_index]; + size_type col_size = num_bytes[col_index]; int8_t *col_tmp = &(row_tmp[output_offset_in_row[col_index]]); const int8_t *col_input = input_data[col_index]; switch (col_size) { @@ -270,9 +276,9 @@ __global__ void copy_to_rows_fixed_width_optimized( break; } default: { - cudf::size_type input_offset = col_size * row_index; + size_type input_offset = col_size * row_index; // TODO this should just not be supported for fixed width columns, but just in case... - for (cudf::size_type b = 0; b < col_size; b++) { + for (size_type b = 0; b < col_size; b++) { col_tmp[b] = col_input[b + input_offset]; } break; @@ -281,10 +287,10 @@ __global__ void copy_to_rows_fixed_width_optimized( // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned // so we have to rewrite the addresses to make sure that it is 4 byte aligned int8_t *valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; + size_type byte_bit_offset = col_index % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); - cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); + size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); // Now copy validity for the column if (input_nm[col_index]) { if (bit_is_set(input_nm[col_index], row_index)) { @@ -306,21 +312,20 @@ __global__ void copy_to_rows_fixed_width_optimized( int64_t *long_shared = reinterpret_cast(shared_data); int64_t *long_output = reinterpret_cast(output_data); - cudf::size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); - cudf::size_type shared_input_stride = blockDim.x * blockDim.y; - cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); + size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); + size_type shared_input_stride = blockDim.x * blockDim.y; + size_type row_index_end = ((row_group_index + 1) * rows_per_group); if (row_index_end > num_rows) { row_index_end = num_rows; } - cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - cudf::size_type shared_length = row_size * num_rows_in_group; + size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); + size_type shared_length = row_size * num_rows_in_group; - cudf::size_type shared_input_end = shared_length / sizeof(int64_t); + size_type shared_input_end = shared_length / sizeof(int64_t); - cudf::size_type start_output_index = - (row_size * row_group_index * rows_per_group) / sizeof(int64_t); + size_type start_output_index = (row_size * row_group_index * rows_per_group) / sizeof(int64_t); - for 
(cudf::size_type shared_index = shared_input_index; shared_index < shared_input_end; + for (size_type shared_index = shared_input_index; shared_index < shared_input_end; shared_index += shared_input_stride) { long_output[start_output_index + shared_index] = long_shared[shared_index]; } @@ -343,13 +348,14 @@ struct block_info { int end_row; int batch_number; - __host__ __device__ size_type get_shared_row_size(size_type const *const col_offsets, - size_type const *const col_sizes) const { - return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); + constexpr size_type get_shared_row_size(size_type const *const col_offsets, + size_type const *const col_sizes) const { + return util::round_up_unsafe(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], + 8); } - __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } + constexpr size_type num_cols() const { return end_col - start_col + 1; } - __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } + constexpr size_type num_rows() const { return end_row - start_row + 1; } }; /** @@ -554,7 +560,7 @@ __global__ void copy_validity_to_rows(const size_type num_rows, const size_type auto const num_sections_x = util::div_rounding_up_unsafe(num_block_cols, 32); auto const num_sections_y = util::div_rounding_up_unsafe(num_block_rows, 32); auto const validity_data_row_length = - align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); + util::round_up_unsafe(util::div_rounding_up_unsafe(num_block_cols, 8), 8); auto const total_sections = num_sections_x * num_sections_y; int const warp_id = threadIdx.x / warp_size; @@ -736,7 +742,7 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col cuda::memcpy_async(group, &shared[fetch_index % stages_count][shared_row_offset], &_col_offsets[fetch_block.start_col], col_offset_bytes, fetch_barrier); shared_row_offset += col_offset_bytes; - shared_row_offset = align_offset(shared_row_offset, 8); + shared_row_offset = util::round_up_unsafe(shared_row_offset, 8); for (auto row = fetch_block_start_row + static_cast(threadIdx.x); row <= fetch_block_end_row; row += blockDim.x) { @@ -764,7 +770,7 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col auto shared_col_offsets = reinterpret_cast(&shared[processing_index % stages_count][col_size_bytes]); - auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); + auto const shared_row_offset = util::round_up_unsafe(col_size_bytes + col_offset_bytes, 8); auto block_row_size = block.get_shared_row_size(_col_offsets, _col_sizes); @@ -813,11 +819,12 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col * @param input_data pointer to input data * */ -__global__ void -copy_validity_from_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_block, const size_type *row_offsets, - cudf::bitmask_type **output_nm, const size_type validity_offset, - device_span block_infos, const int8_t *input_data) { +__global__ void copy_validity_from_rows(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, + const size_type *row_offsets, bitmask_type **output_nm, + const size_type validity_offset, + device_span block_infos, + const int8_t *input_data) { extern __shared__ int8_t shared_data[]; int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { shared_data, shared_data + 
shmem_used_per_block / 2}; @@ -950,10 +957,8 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns, * @param [out] threads the size of the threads for the kernel * @return the size in bytes of shared memory needed for each block. */ -static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, - const cudf::size_type num_rows, - const cudf::size_type size_per_row, dim3 &blocks, - dim3 &threads) { +static int calc_fixed_width_kernel_dims(const size_type num_columns, const size_type num_rows, + const size_type size_per_row, dim3 &blocks, dim3 &threads) { // We have found speed degrades when a thread handles more than 4 columns. // Each block is 2 dimensional. The y dimension indicates the columns. // We limit this to 32 threads in the y dimension so we can still @@ -963,37 +968,29 @@ static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, // in the x dimension because we use atomic operations at the block // level when writing validity data out to main memory, and that would // need to change if we split a word of validity data between blocks. - int y_block_size = (num_columns + 3) / 4; // cudf::util::div_rounding_up_safe(num_columns, 4); - if (y_block_size > 32) { - y_block_size = 32; - } - int x_possible_block_size = 1024 / y_block_size; + int const y_block_size = min(util::div_rounding_up_safe(num_columns, 4), 32); + int const x_possible_block_size = 1024 / y_block_size; // 48KB is the default setting for shared memory per block according to the cuda tutorials // If someone configures the GPU to only have 16 KB this might not work. - int max_shared_size = 48 * 1024; - int max_block_size = max_shared_size / size_per_row; + int const max_shared_size = 48 * 1024; // If we don't have enough shared memory there is no point in having more threads // per block that will just sit idle - max_block_size = max_block_size > x_possible_block_size ? x_possible_block_size : max_block_size; + auto const max_block_size = std::min(x_possible_block_size, max_shared_size / size_per_row); // Make sure that the x dimension is a multiple of 32 this not only helps // coalesce memory access it also lets us do a ballot sync for validity to write // the data back out the warp level. If x is a multiple of 32 then each thread in the y // dimension is associated with one or more warps, that should correspond to the validity // words directly. - int block_size = (max_block_size / 32) * 32; + int const block_size = (max_block_size / 32) * 32; CUDF_EXPECTS(block_size != 0, "Row size is too large to fit in shared memory"); - int num_blocks = (num_rows + block_size - 1) / block_size; - if (num_blocks < 1) { - num_blocks = 1; - } else if (num_blocks > 10240) { - // The maximum number of blocks supported in the x dimension is 2 ^ 31 - 1 - // but in practice haveing too many can cause some overhead that I don't totally - // understand. Playing around with this haveing as little as 600 blocks appears - // to be able to saturate memory on V100, so this is an order of magnitude higher - // to try and future proof this a bit. - num_blocks = 10240; - } + // The maximum number of blocks supported in the x dimension is 2 ^ 31 - 1 + // but in practice haveing too many can cause some overhead that I don't totally + // understand. Playing around with this haveing as little as 600 blocks appears + // to be able to saturate memory on V100, so this is an order of magnitude higher + // to try and future proof this a bit. 
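// A minimal sketch of the block-count policy described above (illustrative only;
// sketch_num_blocks is a hypothetical helper, the real values come from the
// surrounding function).
#include <algorithm>

inline int sketch_num_blocks(int num_rows, int block_size) {
  int const wanted = (num_rows + block_size - 1) / block_size;  // ceil-divide rows over block size
  return std::clamp(wanted, 1, 10240);  // at least one block, capped to limit launch overhead
}
// e.g. sketch_num_blocks(1 << 20, 256) == 4096, sketch_num_blocks(10, 256) == 1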
+ int const num_blocks = std::clamp((num_rows + block_size - 1) / block_size, 1, 10240); + blocks.x = num_blocks; blocks.y = 1; blocks.z = 1; @@ -1009,26 +1006,24 @@ static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, * going from start row and containing the next num_rows. Most of the parameters passed * into this function are common between runs and should be calculated once. */ -static std::unique_ptr -fixed_width_convert_to_rows(const cudf::size_type start_row, const cudf::size_type num_rows, - const cudf::size_type num_columns, const cudf::size_type size_per_row, - rmm::device_uvector &column_start, - rmm::device_uvector &column_size, - rmm::device_uvector &input_data, - rmm::device_uvector &input_nm, - const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, - rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { +static std::unique_ptr fixed_width_convert_to_rows( + const size_type start_row, const size_type num_rows, const size_type num_columns, + const size_type size_per_row, rmm::device_uvector &column_start, + rmm::device_uvector &column_size, rmm::device_uvector &input_data, + rmm::device_uvector &input_nm, const scalar &zero, + const scalar &scalar_size_per_row, rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { int64_t const total_allocation = size_per_row * num_rows; // We made a mistake in the split somehow CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); // Allocate and set the offsets row for the byte array - std::unique_ptr offsets = + std::unique_ptr offsets = cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream); - std::unique_ptr data = cudf::make_numeric_column( - cudf::data_type(cudf::type_id::INT8), static_cast(total_allocation), - cudf::mask_state::UNALLOCATED, stream, mr); + std::unique_ptr data = + make_numeric_column(data_type(type_id::INT8), static_cast(total_allocation), + mask_state::UNALLOCATED, stream, mr); dim3 blocks; dim3 threads; @@ -1039,13 +1034,13 @@ fixed_width_convert_to_rows(const cudf::size_type start_row, const cudf::size_ty start_row, num_rows, num_columns, size_per_row, column_start.data(), column_size.data(), input_data.data(), input_nm.data(), data->mutable_view().data()); - return cudf::make_lists_column(num_rows, std::move(offsets), std::move(data), 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr); + return make_lists_column(num_rows, std::move(offsets), std::move(data), 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr); } -static inline bool are_all_fixed_width(std::vector const &schema) { +static inline bool are_all_fixed_width(std::vector const &schema) { return std::all_of(schema.begin(), schema.end(), - [](const cudf::data_type &t) { return cudf::is_fixed_width(t); }); + [](const data_type &t) { return is_fixed_width(t); }); } /** @@ -1056,18 +1051,18 @@ static inline bool are_all_fixed_width(std::vector const &schem * @param [out] column_size the size in bytes of the data for each columns in the row. * @return the size in bytes each row needs. 
*/ -static inline int32_t compute_fixed_width_layout(std::vector const &schema, - std::vector &column_start, - std::vector &column_size) { +static inline int32_t compute_fixed_width_layout(std::vector const &schema, + std::vector &column_start, + std::vector &column_size) { // We guarantee that the start of each column is 64-bit aligned so anything can go // there, but to make the code simple we will still do an alignment for it. int32_t at_offset = 0; for (auto col = schema.begin(); col < schema.end(); col++) { - cudf::size_type s = cudf::size_of(*col); + size_type s = size_of(*col); column_size.emplace_back(s); std::size_t allocation_needed = s; std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types - at_offset = align_offset(at_offset, alignment_needed); + at_offset = util::round_up_unsafe(at_offset, static_cast(alignment_needed)); column_start.emplace_back(at_offset); at_offset += allocation_needed; } @@ -1075,12 +1070,11 @@ static inline int32_t compute_fixed_width_layout(std::vector co // Now we need to add in space for validity // Eventually we can think about nullable vs not nullable, but for now we will just always add // it in - int32_t validity_bytes_needed = - (schema.size() + 7) / 8; // cudf::util::div_rounding_up_safe(schema.size(), 8); + int32_t const validity_bytes_needed = util::div_rounding_up_safe(schema.size(), 8); // validity comes at the end and is byte aligned so we can pack more in. at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned - return align_offset(at_offset, 8); // 8 bytes (64 bits) + return util::round_up_unsafe(at_offset, 8); // 8 bytes (64 bits) } #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 @@ -1109,8 +1103,8 @@ static size_type compute_column_information(iterator begin, iterator end, auto col_size = nested_type ? 8 : size_of(col_type); // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + size_type const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = util::round_up_unsafe(fixed_width_size_per_row, alignment_needed); column_starts.push_back(fixed_width_size_per_row); column_sizes.push_back(col_size); fixed_width_size_per_row += col_size; @@ -1136,7 +1130,7 @@ build_validity_block_infos(size_type const &num_columns, size_type const &num_ro size_type const &shmem_limit_per_block, std::vector const &row_batches) { auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); - auto const column_stride = align_offset( + auto const column_stride = util::round_up_unsafe( [&]() { if (desired_rows_and_columns > num_columns) { // not many columns, group it into 8s and ship it off @@ -1146,10 +1140,11 @@ build_validity_block_infos(size_type const &num_columns, size_type const &num_ro } }(), 8); + // we fit as much as we can given the column stride // note that an element in the table takes just 1 bit, but a row with a single // element still takes 8 bytes! 
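// Worked example of the validity sizing above (a sketch with made-up numbers, not part
// of the patch): a column_stride of 24 columns packs into ceil(24 / 8) = 3 bytes of
// validity bits, which is then padded to the 8-byte row alignment.
#include <algorithm>
#include <cstdio>

int main() {
  int const column_stride = 24;                 // hypothetical columns handled per validity block
  int const shmem_limit_per_block = 24 * 1024;  // hypothetical shared-memory budget
  int const num_rows = 1000000;
  int const packed_bytes = (column_stride + 7) / 8;        // 1 bit per column, packed into bytes
  int const bytes_per_row = ((packed_bytes + 7) / 8) * 8;  // padded to 8-byte alignment -> 8
  int const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row);
  std::printf("bytes_per_row=%d row_stride=%d\n", bytes_per_row, row_stride);  // prints: bytes_per_row=8 row_stride=3072
}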
- auto const bytes_per_row = align_offset(util::div_rounding_up_unsafe(column_stride, 8), 8); + auto const bytes_per_row = util::round_up_safe(util::div_rounding_up_unsafe(column_stride, 8), 8); auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); std::vector validity_block_infos; @@ -1210,8 +1205,7 @@ template struct row_size_functor { template batch_data build_batches(size_type num_rows, RowSize row_sizes, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - auto uint64_row_sizes = - cudf::detail::make_counting_transform_iterator(0, row_size_functor(row_sizes)); + auto uint64_row_sizes = cudf::detail::make_counting_transform_iterator(0, row_size_functor(row_sizes)); auto const total_size = thrust::reduce(rmm::exec_policy(stream), uint64_row_sizes, uint64_row_sizes + num_rows); auto const num_batches = static_cast( @@ -1413,10 +1407,10 @@ void determine_windows(std::vector const &column_sizes, auto const col_size = column_sizes[col]; // align size for this type - std::size_t alignment_needed = col_size; // They are the same for fixed width types - auto row_size_aligned = detail::align_offset(row_size, alignment_needed); + auto const alignment_needed = col_size; // They are the same for fixed width types + auto row_size_aligned = util::round_up_unsafe(row_size, alignment_needed); auto row_size_with_this_col = row_size_aligned + col_size; - auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); + auto row_size_with_end_pad = util::round_up_unsafe(row_size_with_this_col, 8); if (row_size_with_end_pad * window_height + calc_admin_data_size(col - current_window_start_col) > @@ -1425,7 +1419,7 @@ void determine_windows(std::vector const &column_sizes, f(current_window_start_col, col == 0 ? 
col : col - 1, window_height); row_size = - detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); + util::round_up_unsafe((column_starts[col] + column_sizes[col]) & 7, alignment_needed); row_size += col_size; // alignment required for shared memory window boundary to match // alignment of output row current_window_start_col = col; @@ -1446,9 +1440,9 @@ void determine_windows(std::vector const &column_sizes, } // namespace detail -std::vector> convert_to_rows(cudf::table_view const &tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { +std::vector> convert_to_rows(table_view const &tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 const size_type num_columns = tbl.num_columns(); const size_type num_rows = tbl.num_rows(); @@ -1482,9 +1476,8 @@ std::vector> convert_to_rows(cudf::table_view cons std::vector input_nm; input_data.reserve(num_columns); input_nm.reserve(num_columns); - std::transform( - tbl.begin(), tbl.end(), std::back_inserter(input_data), - [](cudf::column_view const &c) -> int8_t const * { return c.template data(); }); + std::transform(tbl.begin(), tbl.end(), std::back_inserter(input_data), + [](column_view const &c) -> int8_t const * { return c.template data(); }); std::transform(tbl.begin(), tbl.end(), std::back_inserter(input_nm), [](auto c) { return c.null_mask(); }); @@ -1502,7 +1495,7 @@ std::vector> convert_to_rows(cudf::table_view cons return std::make_tuple(tbl.column(i).type(), tbl.column(i)); }); - size_type fixed_width_size_per_row = detail::compute_column_information( + auto const fixed_width_size_per_row = detail::compute_column_information( schema_column_iter, schema_column_iter + num_columns, column_starts, column_sizes); auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); @@ -1513,13 +1506,9 @@ std::vector> convert_to_rows(cudf::table_view cons 0, [fixed_width_size_per_row, num_columns] __device__(auto i) { auto const bytes_needed = fixed_width_size_per_row + util::div_rounding_up_safe(num_columns, 8); - return detail::align_offset(bytes_needed, 8); + return util::round_up_unsafe(bytes_needed, 8); }); - // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then - // calculate the size of each row's variable-width data and validity as well. 
- auto validity_size = num_bitmask_words(num_columns) * 4; - auto batch_info = detail::build_batches(num_rows, row_size_iter, stream, mr); auto gpu_batch_row_boundaries = make_device_uvector_async(batch_info.batch_row_boundaries, stream); @@ -1587,19 +1576,22 @@ std::vector> convert_to_rows(cudf::table_view cons // split up the output buffer into multiple buffers based on row batch sizes // and create list of byte columns - std::vector> ret; - for (int batch = 0; batch < (int)batch_info.row_batches.size(); ++batch) { - auto const offset_count = batch_info.row_batches[batch].row_offsets.size(); - auto offsets = std::make_unique(data_type{type_id::INT32}, (size_type)offset_count, - batch_info.row_batches[batch].row_offsets.release()); - auto data = - std::make_unique(data_type{type_id::INT8}, batch_info.row_batches[batch].num_bytes, - std::move(output_buffers[batch])); - - ret.push_back(cudf::make_lists_column( - batch_info.row_batches[batch].row_count, std::move(offsets), std::move(data), 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr)); - } + std::vector> ret; + auto counting_iter = thrust::make_counting_iterator(0); + std::transform(counting_iter, counting_iter + batch_info.row_batches.size(), + std::back_inserter(ret), [&](auto batch) { + auto const offset_count = batch_info.row_batches[batch].row_offsets.size(); + auto offsets = std::make_unique( + data_type{type_id::INT32}, (size_type)offset_count, + batch_info.row_batches[batch].row_offsets.release()); + auto data = std::make_unique(data_type{type_id::INT8}, + batch_info.row_batches[batch].num_bytes, + std::move(output_buffers[batch])); + + return make_lists_column( + batch_info.row_batches[batch].row_count, std::move(offsets), std::move(data), + 0, rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr); + }); return ret; #else @@ -1608,55 +1600,55 @@ std::vector> convert_to_rows(cudf::table_view cons #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } -std::vector> -convert_to_rows_fixed_width_optimized(cudf::table_view const &tbl, rmm::cuda_stream_view stream, +std::vector> +convert_to_rows_fixed_width_optimized(table_view const &tbl, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - const cudf::size_type num_columns = tbl.num_columns(); + auto const num_columns = tbl.num_columns(); - std::vector schema; + std::vector schema; schema.resize(num_columns); std::transform(tbl.begin(), tbl.end(), schema.begin(), - [](auto i) -> cudf::data_type { return i.type(); }); + [](auto i) -> data_type { return i.type(); }); if (detail::are_all_fixed_width(schema)) { - std::vector column_start; - std::vector column_size; + std::vector column_start; + std::vector column_size; - int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); + int32_t const size_per_row = + detail::compute_fixed_width_layout(schema, column_start, column_size); auto dev_column_start = make_device_uvector_async(column_start, stream, mr); auto dev_column_size = make_device_uvector_async(column_size, stream, mr); - int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; // Make the number of rows per batch a multiple of 32 so we don't have to worry about // splitting validity at a specific row offset. This might change in the future. 
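// Sketch of the batching arithmetic described above (illustrative only;
// sketch_max_rows_per_batch is a hypothetical helper): the per-batch row count is
// bounded by what fits behind an int32 offset and rounded down to a multiple of 32 so
// validity words never straddle a batch boundary.
#include <cstdint>
#include <limits>

inline int32_t sketch_max_rows_per_batch(int32_t size_per_row) {
  auto const rows_that_fit = std::numeric_limits<int32_t>::max() / size_per_row;  // rows addressable by int32 offsets
  return (rows_that_fit / 32) * 32;                                               // round down to a multiple of 32
}
// e.g. with size_per_row == 24, the result is the largest multiple of 32 not exceeding INT32_MAX / 24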
- max_rows_per_batch = (max_rows_per_batch / 32) * 32; + int32_t const max_rows_per_batch = + util::round_down_safe(std::numeric_limits::max() / size_per_row, 32); - cudf::size_type num_rows = tbl.num_rows(); + auto const num_rows = tbl.num_rows(); // Get the pointers to the input columnar data ready std::vector input_data; - std::vector input_nm; - for (cudf::size_type column_number = 0; column_number < num_columns; column_number++) { - cudf::column_view cv = tbl.column(column_number); + std::vector input_nm; + for (size_type column_number = 0; column_number < num_columns; column_number++) { + column_view cv = tbl.column(column_number); input_data.emplace_back(cv.data()); input_nm.emplace_back(cv.null_mask()); } auto dev_input_data = make_device_uvector_async(input_data, stream, mr); auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); - using ScalarType = cudf::scalar_type_t; - auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); + using ScalarType = scalar_type_t; + auto zero = make_numeric_scalar(data_type(type_id::INT32), stream.value()); zero->set_valid_async(true, stream); static_cast(zero.get())->set_value(0, stream); - auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); + auto step = make_numeric_scalar(data_type(type_id::INT32), stream.value()); step->set_valid_async(true, stream); - static_cast(step.get()) - ->set_value(static_cast(size_per_row), stream); + static_cast(step.get())->set_value(static_cast(size_per_row), stream); - std::vector> ret; - for (cudf::size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { - cudf::size_type row_count = num_rows - row_start; + std::vector> ret; + for (size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { + size_type row_count = num_rows - row_start; row_count = row_count > max_rows_per_batch ? 
max_rows_per_batch : row_count; ret.emplace_back(detail::fixed_width_convert_to_rows( row_start, row_count, num_columns, size_per_row, dev_column_start, dev_column_size, @@ -1669,19 +1661,19 @@ convert_to_rows_fixed_width_optimized(cudf::table_view const &tbl, rmm::cuda_str } } -std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { +std::unique_ptr convert_from_rows(lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 // verify that the types are what we expect - cudf::column_view child = input.child(); - cudf::type_id list_type = child.type().id(); - CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, + column_view child = input.child(); + auto const list_type = child.type().id(); + CUDF_EXPECTS(list_type == type_id::INT8 || list_type == type_id::UINT8, "Only a list of bytes is supported as input"); - cudf::size_type num_columns = schema.size(); - cudf::size_type num_rows = input.parent().size(); + auto const num_columns = schema.size(); + auto const num_rows = input.parent().size(); int device_id; CUDA_TRY(cudaGetDevice(&device_id)); @@ -1692,18 +1684,18 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in total_shmem -= 1024; int shmem_limit_per_block = total_shmem / NUM_BLOCKS_PER_KERNEL_LOADED; - std::vector column_starts; - std::vector column_sizes; + std::vector column_starts; + std::vector column_sizes; auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { return std::make_tuple(schema[i], nullptr); }); - size_type fixed_width_size_per_row = + auto const fixed_width_size_per_row = detail::compute_column_information(iter, iter + num_columns, column_starts, column_sizes); - size_type validity_size = num_bitmask_words(num_columns) * 4; + auto const validity_size = num_bitmask_words(num_columns) * 4; - size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); + auto const row_size = util::round_up_unsafe(fixed_width_size_per_row + validity_size, 8); // Ideally we would check that the offsets are all the same, etc. but for now // this is probably fine @@ -1712,12 +1704,12 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); // Allocate the columns we are going to write into - std::vector> output_columns; + std::vector> output_columns; std::vector output_data; - std::vector output_nm; - for (cudf::size_type i = 0; i < num_columns; i++) { - auto column = cudf::make_fixed_width_column(schema[i], num_rows, - cudf::mask_state::UNINITIALIZED, stream, mr); + std::vector output_nm; + for (int i = 0; i < static_cast(num_columns); i++) { + auto column = + make_fixed_width_column(schema[i], num_rows, mask_state::UNINITIALIZED, stream, mr); auto mut = column->mutable_view(); output_data.emplace_back(mut.data()); output_nm.emplace_back(mut.null_mask()); @@ -1786,30 +1778,31 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in dev_output_nm.data(), column_starts.back(), dev_validity_block_infos, child.data()); - return std::make_unique(std::move(output_columns)); + return std::make_unique
(std::move(output_columns)); #else CUDF_FAIL("Row to column conversion optimization requires volta or later hardware."); return {}; #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } -std::unique_ptr convert_from_rows_fixed_width_optimized( - cudf::lists_column_view const &input, std::vector const &schema, +std::unique_ptr
convert_from_rows_fixed_width_optimized( + lists_column_view const &input, std::vector const &schema, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { // verify that the types are what we expect - cudf::column_view child = input.child(); - cudf::type_id list_type = child.type().id(); - CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, + column_view child = input.child(); + auto const list_type = child.type().id(); + CUDF_EXPECTS(list_type == type_id::INT8 || list_type == type_id::UINT8, "Only a list of bytes is supported as input"); - cudf::size_type num_columns = schema.size(); + auto const num_columns = schema.size(); if (detail::are_all_fixed_width(schema)) { - std::vector column_start; - std::vector column_size; + std::vector column_start; + std::vector column_size; - cudf::size_type num_rows = input.parent().size(); - int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); + auto const num_rows = input.parent().size(); + int32_t const size_per_row = + detail::compute_fixed_width_layout(schema, column_start, column_size); // Ideally we would check that the offsets are all the same, etc. but for now // this is probably fine @@ -1819,12 +1812,12 @@ std::unique_ptr convert_from_rows_fixed_width_optimized( auto dev_column_size = make_device_uvector_async(column_size, stream); // Allocate the columns we are going to write into - std::vector> output_columns; + std::vector> output_columns; std::vector output_data; - std::vector output_nm; - for (cudf::size_type i = 0; i < num_columns; i++) { - auto column = cudf::make_fixed_width_column(schema[i], num_rows, - cudf::mask_state::UNINITIALIZED, stream, mr); + std::vector output_nm; + for (int i = 0; i < static_cast(num_columns); i++) { + auto column = + make_fixed_width_column(schema[i], num_rows, mask_state::UNINITIALIZED, stream, mr); auto mut = column->mutable_view(); output_data.emplace_back(mut.data()); output_nm.emplace_back(mut.null_mask()); @@ -1843,7 +1836,7 @@ std::unique_ptr convert_from_rows_fixed_width_optimized( num_rows, num_columns, size_per_row, dev_column_start.data(), dev_column_size.data(), dev_output_data.data(), dev_output_nm.data(), child.data()); - return std::make_unique(std::move(output_columns)); + return std::make_unique
(std::move(output_columns)); } else { CUDF_FAIL("Only fixed width types are currently supported"); } From 27d44d908ab27aed900fac2b0d7e9d0ad5498ea7 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 17 Nov 2021 19:20:37 +0000 Subject: [PATCH 68/80] Updating from review comments --- java/src/main/native/src/row_conversion.cu | 166 +++++++++++---------- 1 file changed, 85 insertions(+), 81 deletions(-) diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 6f3998216b0..b7840da9b30 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -60,6 +60,9 @@ constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; +constexpr auto JCUDF_ROW_ALIGNMENT = 8; +constexpr auto MAX_BATCH_SIZE = std::numeric_limits::max(); + // needed to suppress warning about cuda::barrier #pragma nv_diag_suppress static_var_with_dynamic_init #endif @@ -105,10 +108,10 @@ copy_from_rows_fixed_width_optimized(const size_type num_rows, const size_type n // are controlled by the x dimension (there are multiple blocks in the x // dimension). - size_type rows_per_group = blockDim.x; - size_type row_group_start = blockIdx.x; - size_type row_group_stride = gridDim.x; - size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + size_type const rows_per_group = blockDim.x; + size_type const row_group_start = blockIdx.x; + size_type const row_group_stride = gridDim.x; + size_type const row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; extern __shared__ int8_t shared_data[]; @@ -117,25 +120,22 @@ copy_from_rows_fixed_width_optimized(const size_type num_rows, const size_type n int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; int8_t *row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; - for (size_type row_group_index = row_group_start; row_group_index < row_group_end; + for (auto row_group_index = row_group_start; row_group_index < row_group_end; row_group_index += row_group_stride) { // Step 1: Copy the data into shared memory // We know row_size is always aligned with and a multiple of int64_t; int64_t *long_shared = reinterpret_cast(shared_data); - const int64_t *long_input = reinterpret_cast(input_data); + int64_t const *long_input = reinterpret_cast(input_data); - size_type shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); - size_type shared_output_stride = blockDim.x * blockDim.y; - size_type row_index_end = ((row_group_index + 1) * rows_per_group); - if (row_index_end > num_rows) { - row_index_end = num_rows; - } - size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - size_type shared_length = row_size * num_rows_in_group; + auto const shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); + auto const shared_output_stride = blockDim.x * blockDim.y; + auto const row_index_end = std::min(num_rows, ((row_group_index + 1) * rows_per_group)); + auto const num_rows_in_group = row_index_end - (row_group_index * rows_per_group); + auto const shared_length = row_size * num_rows_in_group; - size_type shared_output_end = shared_length / sizeof(int64_t); + size_type const shared_output_end = shared_length / sizeof(int64_t); - size_type start_input_index = (row_size * row_group_index * rows_per_group) / sizeof(int64_t); + auto const start_input_index = (row_size * row_group_index * rows_per_group) / 
sizeof(int64_t); for (size_type shared_index = shared_output_index; shared_index < shared_output_end; shared_index += shared_output_stride) { @@ -148,18 +148,18 @@ copy_from_rows_fixed_width_optimized(const size_type num_rows, const size_type n // Within the row group there should be 1 thread for each row. This is a // requirement for launching the kernel - size_type row_index = (row_group_index * rows_per_group) + threadIdx.x; + auto const row_index = (row_group_index * rows_per_group) + threadIdx.x; // But we might not use all of the threads if the number of rows does not go // evenly into the thread count. We don't want those threads to exit yet // because we may need them to copy data in for the next row group. uint32_t active_mask = __ballot_sync(0xffffffff, row_index < num_rows); if (row_index < num_rows) { - size_type col_index_start = threadIdx.y; - size_type col_index_stride = blockDim.y; - for (size_type col_index = col_index_start; col_index < num_columns; + auto const col_index_start = threadIdx.y; + auto const col_index_stride = blockDim.y; + for (auto col_index = col_index_start; col_index < num_columns; col_index += col_index_stride) { - size_type col_size = num_bytes[col_index]; - const int8_t *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); + auto const col_size = num_bytes[col_index]; + int8_t const *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); int8_t *col_output = output_data[col_index]; switch (col_size) { case 1: { @@ -182,9 +182,9 @@ copy_from_rows_fixed_width_optimized(const size_type num_rows, const size_type n break; } default: { - size_type output_offset = col_size * row_index; + auto const output_offset = col_size * row_index; // TODO this should just not be supported for fixed width columns, but just in case... 
- for (size_type b = 0; b < col_size; b++) { + for (auto b = 0; b < col_size; b++) { col_output[b + output_offset] = col_tmp[b]; } break; @@ -351,7 +351,7 @@ struct block_info { constexpr size_type get_shared_row_size(size_type const *const col_offsets, size_type const *const col_sizes) const { return util::round_up_unsafe(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], - 8); + JCUDF_ROW_ALIGNMENT); } constexpr size_type num_cols() const { return end_col - start_col + 1; } @@ -365,9 +365,21 @@ struct block_info { * */ struct row_batch { - size_type num_bytes; - size_type row_count; - device_uvector row_offsets; + size_type num_bytes; // number of bytes in this batch + size_type row_count; // number of rows in the batch + device_uvector row_offsets; // offsets column of output cudf column +}; + +/** + * @brief Holds information about the batches of data to be processed + * + */ +struct batch_data { + device_uvector + batch_row_offsets; // offsets to each row in the JCUDF data from batch start + std::vector + batch_row_boundaries; // row numbers for different batches like 0, 10000, 20000 + std::vector row_batches; // information about each batch such as byte count }; /** @@ -412,8 +424,8 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum group.sync(); auto const blocks_remaining = - std::min((uint)block_infos.size() - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS, - (uint)NUM_BLOCKS_PER_KERNEL_TO_ROWS); + std::min(static_cast(block_infos.size()) - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS, + static_cast(NUM_BLOCKS_PER_KERNEL_TO_ROWS)); size_t fetch; size_t subset; @@ -441,7 +453,7 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum // and do row-based memcopies out. auto const shared_buffer_base = shared[fetch % stages_count]; - for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { + for (auto el = static_cast(threadIdx.x); el < num_elements_in_block; el += blockDim.x) { auto const relative_col = el / num_fetch_rows; auto const relative_row = el % num_fetch_rows; auto const absolute_col = relative_col + fetch_block.start_col; @@ -533,8 +545,8 @@ __global__ void copy_validity_to_rows(const size_type num_rows, const size_type auto group = cooperative_groups::this_thread_block(); int const blocks_remaining = - std::min((uint)block_infos.size() - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, - (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); + std::min(static_cast(block_infos.size()) - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, + static_cast(NUM_VALIDITY_BLOCKS_PER_KERNEL)); __shared__ cuda::barrier shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; @@ -560,7 +572,7 @@ __global__ void copy_validity_to_rows(const size_type num_rows, const size_type auto const num_sections_x = util::div_rounding_up_unsafe(num_block_cols, 32); auto const num_sections_y = util::div_rounding_up_unsafe(num_block_rows, 32); auto const validity_data_row_length = - util::round_up_unsafe(util::div_rounding_up_unsafe(num_block_cols, 8), 8); + util::round_up_unsafe(util::div_rounding_up_unsafe(num_block_cols, 8), JCUDF_ROW_ALIGNMENT); auto const total_sections = num_sections_x * num_sections_y; int const warp_id = threadIdx.x / warp_size; @@ -705,8 +717,8 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col group.sync(); auto blocks_remaining = - std::min((uint)block_infos.size() - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS, - (uint)NUM_BLOCKS_PER_KERNEL_FROM_ROWS); + 
std::min(static_cast(block_infos.size()) - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS, + static_cast(NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); size_t fetch_index; size_t processing_index; @@ -838,8 +850,8 @@ __global__ void copy_validity_from_rows(const size_type num_rows, const size_typ auto group = cooperative_groups::this_thread_block(); int const blocks_remaining = - std::min((uint)block_infos.size() - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, - (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); + std::min(static_cast(block_infos.size()) - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, + static_cast(NUM_VALIDITY_BLOCKS_PER_KERNEL)); __shared__ cuda::barrier shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; @@ -862,8 +874,8 @@ __global__ void copy_validity_from_rows(const size_type num_rows, const size_typ auto const block_start_row = block.start_row; auto const num_block_cols = block.num_cols(); auto const num_block_rows = block.num_rows(); - auto const num_sections_x = (num_block_cols + 7) / 8; - auto const num_sections_y = (num_block_rows + 31) / 32; + auto const num_sections_x = util::div_rounding_up_safe(num_block_cols, 8); + auto const num_sections_y = util::div_rounding_up_safe(num_block_rows, 32); auto const validity_data_col_length = num_sections_y * 4; // words to bytes auto const total_sections = num_sections_x * num_sections_y; int const warp_id = threadIdx.x / warp_size; @@ -1015,7 +1027,8 @@ static std::unique_ptr fixed_width_convert_to_rows( rmm::mr::device_memory_resource *mr) { int64_t const total_allocation = size_per_row * num_rows; // We made a mistake in the split somehow - CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); + CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), + "Table is too large to fit!"); // Allocate and set the offsets row for the byte array std::unique_ptr offsets = @@ -1074,7 +1087,7 @@ static inline int32_t compute_fixed_width_layout(std::vector const &s // validity comes at the end and is byte aligned so we can pack more in. 
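// Worked example of the fixed-width layout rules above (a sketch, not part of the
// patch; the schema of sizes {1, 4, 8} is hypothetical): each column start is aligned
// to its own size, validity bits are appended byte-aligned, and the row is padded to
// 8 bytes.
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> sizes{1, 4, 8};  // e.g. INT8, INT32, INT64
  int at_offset = 0;
  for (int s : sizes) {
    at_offset = ((at_offset + s - 1) / s) * s;  // align the column start to its own size
    std::printf("column of size %d starts at offset %d\n", s, at_offset);
    at_offset += s;
  }
  at_offset += (static_cast<int>(sizes.size()) + 7) / 8;  // validity bits, byte aligned at the end
  int const row_size = ((at_offset + 7) / 8) * 8;         // pad so every row is 64-bit aligned
  std::printf("row size = %d bytes\n", row_size);         // prints 24 for this schema
}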
at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned - return util::round_up_unsafe(at_offset, 8); // 8 bytes (64 bits) + return util::round_up_unsafe(at_offset, JCUDF_ROW_ALIGNMENT); } #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 @@ -1096,7 +1109,7 @@ static size_type compute_column_information(iterator begin, iterator end, size_type fixed_width_size_per_row = 0; for (auto cv = begin; cv != end; ++cv) { auto col_type = std::get<0>(*cv); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + bool nested_type = is_compound(col_type); // a list or string column will write a single uint64 // of data here for offset/length @@ -1129,7 +1142,7 @@ std::vector build_validity_block_infos(size_type const &num_columns, size_type const &num_rows, size_type const &shmem_limit_per_block, std::vector const &row_batches) { - auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); + auto const desired_rows_and_columns = static_cast(sqrt(shmem_limit_per_block)); auto const column_stride = util::round_up_unsafe( [&]() { if (desired_rows_and_columns > num_columns) { @@ -1139,12 +1152,13 @@ build_validity_block_infos(size_type const &num_columns, size_type const &num_ro return util::round_down_safe(desired_rows_and_columns, 8); } }(), - 8); + JCUDF_ROW_ALIGNMENT); // we fit as much as we can given the column stride // note that an element in the table takes just 1 bit, but a row with a single // element still takes 8 bytes! - auto const bytes_per_row = util::round_up_safe(util::div_rounding_up_unsafe(column_stride, 8), 8); + auto const bytes_per_row = + util::round_up_safe(util::div_rounding_up_unsafe(column_stride, 8), JCUDF_ROW_ALIGNMENT); auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); std::vector validity_block_infos; @@ -1169,18 +1183,6 @@ build_validity_block_infos(size_type const &num_columns, size_type const &num_ro return validity_block_infos; } -constexpr size_type max_batch_size = std::numeric_limits::max(); - -/** - * @brief Holds information about the batches of data to be processed - * - */ -struct batch_data { - device_uvector batch_row_offsets; - std::vector batch_row_boundaries; - std::vector row_batches; -}; - template struct row_size_functor { RowSize _row_sizes; size_type _num_rows; @@ -1205,11 +1207,12 @@ template struct row_size_functor { template batch_data build_batches(size_type num_rows, RowSize row_sizes, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - auto uint64_row_sizes = cudf::detail::make_counting_transform_iterator(0, row_size_functor(row_sizes)); + auto uint64_row_sizes = + cudf::detail::make_counting_transform_iterator(0, row_size_functor(row_sizes)); auto const total_size = thrust::reduce(rmm::exec_policy(stream), uint64_row_sizes, uint64_row_sizes + num_rows); auto const num_batches = static_cast( - util::div_rounding_up_safe(total_size, static_cast(max_batch_size))); + util::div_rounding_up_safe(total_size, static_cast(MAX_BATCH_SIZE))); auto const num_offsets = num_batches + 1; std::vector row_batches; std::vector batch_row_boundaries; @@ -1223,12 +1226,12 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, rmm::cuda_stream thrust::inclusive_scan(rmm::exec_policy(stream), uint64_row_sizes, uint64_row_sizes + num_rows, cumulative_row_sizes.begin()); - while ((int)batch_row_boundaries.size() < num_offsets) { - // find the next max_batch_size boundary + while (static_cast(batch_row_boundaries.size()) 
< num_offsets) { + // find the next MAX_BATCH_SIZE boundary size_type const row_end = ((thrust::lower_bound(rmm::exec_policy(stream), cumulative_row_sizes.begin(), cumulative_row_sizes.begin() + (num_rows - last_row_end), - max_batch_size) - + MAX_BATCH_SIZE) - cumulative_row_sizes.begin()) + last_row_end); @@ -1346,7 +1349,7 @@ build_blocks(device_span blocks, int const max_row = std::min(total_number_of_rows - 1, batch_index + 1 > num_batches ? - std::numeric_limits::max() : + std::numeric_limits::max() : static_cast(batch_row_boundaries[batch_index + 1]) - 1); int const block_row_end = std::min( batch_row_start + ((local_block_index + 1) * desired_window_height) - 1, max_row); @@ -1387,8 +1390,8 @@ void determine_windows(std::vector const &column_sizes, // window as far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = // shared_mem_size since we want them equal, so height and width are sqrt(shared_mem_size). The // trick is that it's in bytes, not rows or columns. - size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); - int const window_height = + auto const optimal_square_len = static_cast(sqrt(shmem_limit_per_block)); + auto const window_height = std::clamp(util::round_up_safe( std::min(optimal_square_len / column_sizes[0], total_number_of_rows), 32), 1, first_row_batch_size); @@ -1403,14 +1406,15 @@ void determine_windows(std::vector const &column_sizes, int row_size = 0; // march each column and build the blocks of appropriate sizes - for (unsigned int col = 0; col < column_sizes.size(); ++col) { + for (uint col = 0; col < column_sizes.size(); ++col) { auto const col_size = column_sizes[col]; // align size for this type auto const alignment_needed = col_size; // They are the same for fixed width types - auto row_size_aligned = util::round_up_unsafe(row_size, alignment_needed); - auto row_size_with_this_col = row_size_aligned + col_size; - auto row_size_with_end_pad = util::round_up_unsafe(row_size_with_this_col, 8); + auto const row_size_aligned = util::round_up_unsafe(row_size, alignment_needed); + auto const row_size_with_this_col = row_size_aligned + col_size; + auto const row_size_with_end_pad = + util::round_up_unsafe(row_size_with_this_col, JCUDF_ROW_ALIGNMENT); if (row_size_with_end_pad * window_height + calc_admin_data_size(col - current_window_start_col) > @@ -1432,7 +1436,7 @@ void determine_windows(std::vector const &column_sizes, // build last set of blocks if (current_window_width > 0) { - f(current_window_start_col, (int)column_sizes.size() - 1, window_height); + f(current_window_start_col, static_cast(column_sizes.size()) - 1, window_height); } } @@ -1444,8 +1448,8 @@ std::vector> convert_to_rows(table_view const &tbl, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - const size_type num_columns = tbl.num_columns(); - const size_type num_rows = tbl.num_rows(); + auto const num_columns = tbl.num_columns(); + auto const num_rows = tbl.num_rows(); int device_id; CUDA_TRY(cudaGetDevice(&device_id)); @@ -1454,7 +1458,7 @@ std::vector> convert_to_rows(table_view const &tbl, // TODO: why is this needed. kernel fails to launch if all memory is requested. total_shmem -= 1024; - int shmem_limit_per_block = total_shmem / NUM_BLOCKS_PER_KERNEL_LOADED; + auto const shmem_limit_per_block = total_shmem / NUM_BLOCKS_PER_KERNEL_LOADED; // break up the work into blocks, which are a starting and ending row/col #. 
// this window size is calculated based on the shared memory size available @@ -1506,7 +1510,7 @@ std::vector> convert_to_rows(table_view const &tbl, 0, [fixed_width_size_per_row, num_columns] __device__(auto i) { auto const bytes_needed = fixed_width_size_per_row + util::div_rounding_up_safe(num_columns, 8); - return util::round_up_unsafe(bytes_needed, 8); + return util::round_up_unsafe(bytes_needed, JCUDF_ROW_ALIGNMENT); }); auto batch_info = detail::build_batches(num_rows, row_size_iter, stream, mr); @@ -1621,8 +1625,8 @@ convert_to_rows_fixed_width_optimized(table_view const &tbl, rmm::cuda_stream_vi // Make the number of rows per batch a multiple of 32 so we don't have to worry about // splitting validity at a specific row offset. This might change in the future. - int32_t const max_rows_per_batch = - util::round_down_safe(std::numeric_limits::max() / size_per_row, 32); + auto const max_rows_per_batch = + util::round_down_safe(std::numeric_limits::max() / size_per_row, 32); auto const num_rows = tbl.num_rows(); @@ -1695,7 +1699,8 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, auto const validity_size = num_bitmask_words(num_columns) * 4; - auto const row_size = util::round_up_unsafe(fixed_width_size_per_row + validity_size, 8); + auto const row_size = + util::round_up_unsafe(fixed_width_size_per_row + validity_size, JCUDF_ROW_ALIGNMENT); // Ideally we would check that the offsets are all the same, etc. but for now // this is probably fine @@ -1755,7 +1760,7 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, dim3 blocks( util::div_rounding_up_unsafe(gpu_block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); - dim3 threads(std::min(std::min(256, shmem_limit_per_block / 8), (int)child.size())); + dim3 threads(std::min(std::min(256, shmem_limit_per_block / 8), static_cast(child.size()))); auto validity_block_infos = detail::build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); @@ -1801,8 +1806,7 @@ std::unique_ptr
convert_from_rows_fixed_width_optimized( std::vector column_size; auto const num_rows = input.parent().size(); - int32_t const size_per_row = - detail::compute_fixed_width_layout(schema, column_start, column_size); + auto const size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); // Ideally we would check that the offsets are all the same, etc. but for now // this is probably fine From 3a488440bb5e59f5e95f93e86b35d2bba3920828 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 18 Nov 2021 04:48:56 +0000 Subject: [PATCH 69/80] removing odd size writing since destination is now padded --- java/src/main/native/src/row_conversion.cu | 39 ++-------------------- 1 file changed, 3 insertions(+), 36 deletions(-) diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index b7840da9b30..8ee7b893dd9 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -375,8 +375,7 @@ struct row_batch { * */ struct batch_data { - device_uvector - batch_row_offsets; // offsets to each row in the JCUDF data from batch start + device_uvector batch_row_offsets; // offset column of returned cudf column std::vector batch_row_boundaries; // row numbers for different batches like 0, 10000, 20000 std::vector row_batches; // information about each batch such as byte count @@ -607,23 +606,7 @@ __global__ void copy_validity_to_rows(const size_type num_rows, const size_type auto const validity_write_offset = validity_data_row_length * (relative_row + i) + relative_col / 8; if (threadIdx.x % warp_size == 0) { - if (cols_left <= 8) { - // write byte - this_shared_block[validity_write_offset] = validity_data & 0xFF; - } else if (cols_left <= 16) { - // write int16 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - } else if (cols_left <= 24) { - // write int16 and then int8 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; - } else { - // write int32 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data; - } + *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data; } } } @@ -911,23 +894,7 @@ __global__ void copy_validity_from_rows(const size_type num_rows, const size_typ auto const validity_write_offset = validity_data_col_length * (relative_col + i) + relative_row / 8; - if (rows_left <= 8) { - // write byte - this_shared_block[validity_write_offset] = validity_data & 0xFF; - } else if (rows_left <= 16) { - // write int16 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - } else if (rows_left <= 24) { - // write int16 and then int8 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; - } else { - // write int32 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data; - } + *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data; } } } From 7595eaf36dd6fad557fa32a0ea383f5fbce13b36 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 7 Dec 2021 06:37:09 +0000 Subject: [PATCH 70/80] performance improvements --- java/src/main/native/src/row_conversion.cu | 165 +++++++++------------ 1 file changed, 73 insertions(+), 92 deletions(-) diff --git a/java/src/main/native/src/row_conversion.cu 
b/java/src/main/native/src/row_conversion.cu index 8ee7b893dd9..c44ac7343e7 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -489,20 +489,25 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; subset_barrier.arrive_and_wait(); - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + subset]; + auto const block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + subset]; auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; auto const block_output_buffer = output_data[block.batch_number]; - // copy entire rows to final dest - for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; - absolute_row += blockDim.x) { - auto const relative_row = absolute_row - block.start_row; - auto const output_dest = block_output_buffer + row_offsets[absolute_row] + column_offset; - auto const shared_offset = block_row_size * relative_row; + // copy entire row 8 bytes at a time + auto const chunks_per_row = util::div_rounding_up_unsafe(block_row_size, 8); + auto const total_chunks = chunks_per_row * block.num_rows(); - cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], - cuda::aligned_size_t<8>(block_row_size), subset_barrier); + for (auto i = threadIdx.x; i < total_chunks; i += blockDim.x) { + // determine source address of my chunk + auto const relative_row = i / chunks_per_row; + auto const relative_chunk_offset = (i % chunks_per_row) * 8; + auto const output_dest = block_output_buffer + row_offsets[relative_row + block.start_row] + + column_offset + relative_chunk_offset; + auto const input_src = + &shared[subset % stages_count][block_row_size * relative_row + relative_chunk_offset]; + + cuda::memcpy_async(output_dest, input_src, cuda::aligned_size_t<8>(8), subset_barrier); } } @@ -588,7 +593,6 @@ __global__ void copy_validity_to_rows(const size_type num_rows, const size_type auto const relative_row = section_y * 32; auto const absolute_col = relative_col + block.start_col; auto const absolute_row = relative_row + block.start_row; - auto const cols_left = num_columns - absolute_col; auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); if (absolute_col < num_columns) { @@ -618,15 +622,29 @@ __global__ void copy_validity_to_rows(const size_type num_rows, const size_type auto const output_data_base = output_data[block.batch_number] + validity_offset + block.start_col / 8; - // now async memcpy the shared memory out to the final destination - for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { - auto const relative_row = row - block.start_row; - auto const output_ptr = output_data_base + row_offsets[row]; - auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); - - cuda::memcpy_async( - output_ptr, &this_shared_block[validity_data_row_length * relative_row], num_bytes, - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); + // now async memcpy the shared memory out to the final destination 4 bytes at a time since we do + // 32-row chunks + auto const row_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); + auto const chunks_per_row = util::div_rounding_up_unsafe(row_bytes, 8); + auto const total_chunks = chunks_per_row * block.num_rows(); + auto &subset_barrier = + 
shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + auto const tail_bytes = row_bytes % 8; + + for (auto i = threadIdx.x; i < total_chunks; i += blockDim.x) { + // determine source address of my chunk + auto const relative_row = i / chunks_per_row; + auto const col_chunk = i % chunks_per_row; + auto const relative_chunk_offset = col_chunk * 8; + auto const output_dest = + output_data_base + row_offsets[relative_row + block.start_row] + relative_chunk_offset; + auto const input_src = + &this_shared_block[validity_data_row_length * relative_row + relative_chunk_offset]; + + if (tail_bytes > 0 && col_chunk == chunks_per_row - 1) + cuda::memcpy_async(output_dest, input_src, tail_bytes, subset_barrier); + else + cuda::memcpy_async(output_dest, input_src, cuda::aligned_size_t<8>(8), subset_barrier); } } @@ -638,22 +656,6 @@ __global__ void copy_validity_to_rows(const size_type num_rows, const size_type } } -/** - * @brief Admin data is data stored in shared memory that isn't actual column data - * - * @param col_size_size size of the column size data. - * @param col_offset_size size of the column offset data. - * @param num_cols number of columns in the block. - * @return tuple of the size of column and offset admin data. - */ -static __device__ std::tuple -get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num_cols) { - auto const col_size_bytes = num_cols * col_size_size; - auto const col_offset_bytes = num_cols * col_offset_size; - - return {col_size_bytes, col_offset_bytes}; -} - /** * @brief copy data from row-based format to cudf columns * @@ -670,8 +672,8 @@ get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num */ __global__ void copy_from_rows(const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, const size_type *row_offsets, - int8_t **output_data, const size_type *_col_sizes, - const size_type *_col_offsets, + int8_t **output_data, const size_type *col_sizes, + const size_type *col_offsets, device_span block_infos, const int8_t *input_data) { // We are going to copy the data in two passes. 
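The chunked copies above move each shared-memory row out 8 bytes at a time, falling back to a short tail copy when the row's validity byte count is not a multiple of 8. A minimal host-side sketch of that chunk indexing, using example sizes and printf in place of cuda::memcpy_async (illustration only, not part of the patch):

#include <cstdio>

int main() {
  int const row_bytes = 21;                        // validity bytes per row in a tile (example value)
  int const num_rows = 3;                          // rows in the tile (example value)
  int const chunks_per_row = (row_bytes + 7) / 8;  // util::div_rounding_up_unsafe(row_bytes, 8)
  int const tail_bytes = row_bytes % 8;

  for (int i = 0; i < chunks_per_row * num_rows; ++i) {
    int const relative_row = i / chunks_per_row;   // which row this chunk belongs to
    int const col_chunk = i % chunks_per_row;      // which 8-byte chunk within that row
    int const byte_offset = col_chunk * 8;
    int const copy_bytes = (tail_bytes > 0 && col_chunk == chunks_per_row - 1) ? tail_bytes : 8;
    std::printf("chunk %d: row %d, byte offset %d, copy %d bytes\n", i, relative_row, byte_offset,
                copy_bytes);
  }
  return 0;
}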
@@ -714,12 +716,8 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + fetch_index]; auto const fetch_block_start_row = fetch_block.start_row; - auto const fetch_block_end_row = fetch_block.end_row; - auto const starting_col_offset = _col_offsets[fetch_block.start_col]; - auto const fetch_block_row_size = fetch_block.get_shared_row_size(_col_offsets, _col_sizes); - auto const num_fetch_cols = fetch_block.num_cols(); - auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( - sizeof(decltype(*_col_sizes)), sizeof(decltype(*_col_offsets)), num_fetch_cols); + auto const starting_col_offset = col_offsets[fetch_block.start_col]; + auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); auto &fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; // if we have fetched all buffers, we need to wait for processing @@ -728,22 +726,10 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col fetch_barrier.arrive_and_wait(); } - auto shared_row_offset = 0; - // copy the data for column sizes - cuda::memcpy_async(group, &shared[fetch_index % stages_count][shared_row_offset], - &_col_sizes[fetch_block.start_col], col_size_bytes, fetch_barrier); - shared_row_offset += col_size_bytes; - // copy the data for column offsets - cuda::memcpy_async(group, &shared[fetch_index % stages_count][shared_row_offset], - &_col_offsets[fetch_block.start_col], col_offset_bytes, fetch_barrier); - shared_row_offset += col_offset_bytes; - shared_row_offset = util::round_up_unsafe(shared_row_offset, 8); - for (auto row = fetch_block_start_row + static_cast(threadIdx.x); - row <= fetch_block_end_row; row += blockDim.x) { - auto shared_offset = - (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; - // copy the main + row <= fetch_block.end_row; row += blockDim.x) { + auto shared_offset = (row - fetch_block_start_row) * fetch_block_row_size; + // copy the data cuda::memcpy_async(&shared[fetch_index % stages_count][shared_offset], &input_data[row_offsets[row] + starting_col_offset], fetch_block_row_size, fetch_barrier); @@ -755,19 +741,10 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col // ensure our data is ready processing_barrier.arrive_and_wait(); - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + processing_index]; + auto const block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + processing_index]; auto const rows_in_block = block.num_rows(); auto const cols_in_block = block.num_cols(); - - auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( - sizeof(decltype(*_col_sizes)), sizeof(decltype(*_col_offsets)), cols_in_block); - auto shared_col_sizes = reinterpret_cast(shared[processing_index % stages_count]); - auto shared_col_offsets = - reinterpret_cast(&shared[processing_index % stages_count][col_size_bytes]); - - auto const shared_row_offset = util::round_up_unsafe(col_size_bytes + col_offset_bytes, 8); - - auto block_row_size = block.get_shared_row_size(_col_offsets, _col_sizes); + auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); // now we copy from shared memory to final destination. 
// the data is laid out in rows in shared memory, so the reads @@ -783,9 +760,9 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col auto const absolute_row = relative_row + block.start_row; auto const shared_memory_row_offset = block_row_size * relative_row; - auto const shared_memory_offset = shared_col_offsets[relative_col] - shared_col_offsets[0] + - shared_memory_row_offset + shared_row_offset; - auto const column_size = shared_col_sizes[relative_col]; + auto const shared_memory_offset = + col_offsets[absolute_col] - col_offsets[block.start_col] + shared_memory_row_offset; + auto const column_size = col_sizes[absolute_col]; int8_t *shmem_src = &shared[processing_index % stages_count][shared_memory_offset]; int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; @@ -875,7 +852,6 @@ __global__ void copy_validity_from_rows(const size_type num_rows, const size_typ auto const relative_row = section_y * 32 + lane_id; auto const absolute_col = relative_col + block_start_col; auto const absolute_row = relative_row + block_start_row; - auto const rows_left = num_rows - absolute_row; auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_row < num_rows); @@ -903,15 +879,29 @@ __global__ void copy_validity_from_rows(const size_type num_rows, const size_typ // make sure entire block has finished copy group.sync(); - // now async memcpy the shared - for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { - auto const relative_col = col - block.start_col; - auto const starting_address = output_nm[col] + word_index(block_start_row); - - cuda::memcpy_async( - starting_address, &this_shared_block[validity_data_col_length * relative_col], - util::div_rounding_up_unsafe(num_block_rows, 8), - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); + // now async memcpy the shared memory out to the final destination 8 bytes at a time + auto const col_bytes = util::div_rounding_up_unsafe(num_block_rows, 8); + auto const chunks_per_col = util::div_rounding_up_unsafe(col_bytes, 8); + auto const total_chunks = chunks_per_col * num_block_cols; + auto &subset_barrier = + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + auto const tail_bytes = col_bytes % 8; + + for (auto i = threadIdx.x; i < total_chunks; i += blockDim.x) { + // determine source address of my chunk + auto const relative_col = i / chunks_per_col; + auto const row_chunk = i % chunks_per_col; + auto const absolute_col = relative_col + block_start_col; + auto const relative_chunk_byte_offset = row_chunk * 8; + auto const output_dest = + output_nm[absolute_col] + word_index(block_start_row) + row_chunk * 2; + auto const input_src = + &this_shared_block[validity_data_col_length * relative_col + relative_chunk_byte_offset]; + + if (tail_bytes > 0 && row_chunk == chunks_per_col - 1) + cuda::memcpy_async(output_dest, input_src, tail_bytes, subset_barrier); + else + cuda::memcpy_async(output_dest, input_src, cuda::aligned_size_t<8>(8), subset_barrier); } } @@ -1363,13 +1353,6 @@ void determine_windows(std::vector const &column_sizes, std::min(optimal_square_len / column_sizes[0], total_number_of_rows), 32), 1, first_row_batch_size); - auto calc_admin_data_size = [](int num_cols) -> size_type { - // admin data is the column sizes and column start information. - // this is copied to shared memory as well and needs to be accounted for - // in the window calculation. 
- return num_cols * sizeof(size_type) + num_cols * sizeof(size_type); - }; - int row_size = 0; // march each column and build the blocks of appropriate sizes @@ -1383,9 +1366,7 @@ void determine_windows(std::vector const &column_sizes, auto const row_size_with_end_pad = util::round_up_unsafe(row_size_with_this_col, JCUDF_ROW_ALIGNMENT); - if (row_size_with_end_pad * window_height + - calc_admin_data_size(col - current_window_start_col) > - shmem_limit_per_block) { + if (row_size_with_end_pad * window_height > shmem_limit_per_block) { // too large, close this window, generate vertical blocks and restart f(current_window_start_col, col == 0 ? col : col - 1, window_height); From 74afad7c64e1f1c6529adfe4ca7df6baa7748cc1 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 14 Dec 2021 21:10:07 -0500 Subject: [PATCH 71/80] Update java/src/main/native/src/row_conversion.cu Co-authored-by: MithunR --- java/src/main/native/src/row_conversion.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index c44ac7343e7..15a7fbf02e3 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -75,7 +75,7 @@ namespace java { namespace detail { /** - * @brief Copies data from row-base JCUDF format to column-based cudf format. + * @brief Copies data from row-based JCUDF format to column-based cudf format. * * This optimized version of the conversion is faster for fixed-width tables * that do not have more than 100 columns. From 57a84e4e2663ea2a291892caa76f4cc2640a3a9d Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 23 Dec 2021 02:03:02 +0000 Subject: [PATCH 72/80] changes from review comments --- java/src/main/native/src/row_conversion.cu | 1058 +++++++++++--------- 1 file changed, 595 insertions(+), 463 deletions(-) diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 15a7fbf02e3..c1b6bdbce5d 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -14,21 +14,7 @@ * limitations under the License. 
*/ -#include -#include -#include -#include -#include -#include -#include - #include -#include - -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 -#include -#endif - #include #include #include @@ -50,17 +36,31 @@ #include #include #include +#include #include "row_conversion.hpp" #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 -constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 2; -constexpr auto NUM_BLOCKS_PER_KERNEL_TO_ROWS = 2; -constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; -constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; -constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; +#include +#endif + +#include +#include +#include +#include +#include +#include +#include constexpr auto JCUDF_ROW_ALIGNMENT = 8; + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +constexpr auto NUM_TILES_PER_KERNEL_FROM_ROWS = 2; +constexpr auto NUM_TILES_PER_KERNEL_TO_ROWS = 2; +constexpr auto NUM_TILES_PER_KERNEL_LOADED = 2; +constexpr auto NUM_VALIDITY_TILES_PER_KERNEL = 8; +constexpr auto NUM_VALIDITY_TILES_PER_KERNEL_LOADED = 2; + constexpr auto MAX_BATCH_SIZE = std::numeric_limits::max(); // needed to suppress warning about cuda::barrier @@ -71,9 +71,139 @@ using namespace cudf; using detail::make_device_uvector_async; using rmm::device_uvector; namespace cudf { -namespace java { +namespace jni { namespace detail { +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + +/************************************************************************ + * This module converts data from row-major to column-major and from column-major + * to row-major. It is a transpose of the data of sorts, but there are a few + * complicating factors. They are spelled out below: + * + * Row Batches: + * The row data has to fit inside a + * cuDF column, which limits it to 2 gigs currently. The calling code attempts + * to keep the data size under 2 gigs, but due to padding this isn't always + * the case, so being able to break this up into multiple columns is necessary. + * Internally, this is referred to as the row batch, which is a group of rows + * that will fit into this 2 gig space requirement. There are typically 1 of + * these batches, but there can be 2. + * + * Async Memcpy: + * The CUDA blocks are using memcpy_async, which allows for the device to + * schedule memcpy operations and then wait on them to complete at a later + * time with a barrier. The recommendation is to double-buffer the work + * so that processing can occur while a copy operation is being completed. + * On Ampere or later hardware there is dedicated hardware to do this copy + * and on pre-Ampere it should generate the same code that a hand-rolled + * loop would generate, so performance should be the same or better than + * a hand-rolled kernel. + * + * Tile Info: + * Each CUDA block will work on NUM_TILES_PER_KERNEL_*_ROWS tile infos + * before exiting. It will have enough shared memory available to load + * NUM_TILES_PER_KERNEL_LOADED tiles at one time. The block will load + * as many tiles as it can fit into shared memory and then wait on the + * first tile to completely load before processing. Processing in this + * case means coping the data from shared memory back out to device + * memory via memcpy_async. This kernel is completely memory bound. + * + * Batch Data: + * This structure contains all the row batches and some book-keeping + * data necessary for the batches such as row numbers for the batches. + * + * Tiles: + * The tile info describes a tile of data to process. 
In a GPU with + * 48KB of shared memory each tile uses approximately 24KB of memory + * which equates to about 144 bytes in each direction. The tiles are + * kept as square as possible to attempt to coalesce memory operations. + * The taller a tile is the better coalescing of columns, but row + * coalescing suffers. The wider a tile is the better the row coalescing, + * but columns coalescing suffers. The code attempts to produce a square + * tile to balance the coalescing. It starts by figuring out the optimal + * byte length and then adding columns to the data until the tile is too + * large. Since rows are different width with different alignment + * requirements, this isn't typically exact. Once a width is found the + * tiles are generated vertically with that width and height and then + * the process repeats. This means all the tiles will be the same + * height, but will have different widths based on what columns they + * encompass. Tiles in a vertical row will all have the same dimensions. + * + * -------------------------------- + * | 4 5.0f || True 8 3 1 | + * | 3 6.0f || False 3 1 1 | + * | 2 7.0f || True 7 4 1 | + * | 1 8.0f || False 2 5 1 | + * -------------------------------- + * | 0 9.0f || True 6 7 1 | + * ... + ************************************************************************/ + +/** + * @brief The CUDA blocks work on one or more tile_info structs of data. + * This structure defines the workspaces for the blocks. + * + */ +struct tile_info { + int start_col; + int start_row; + int end_col; + int end_row; + int batch_number; + + CUDA_DEVICE_CALLABLE + size_type get_shared_row_size(size_type const *const col_offsets, + size_type const *const col_sizes) const { + return util::round_up_unsafe(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], + JCUDF_ROW_ALIGNMENT); + } + + CUDA_DEVICE_CALLABLE + size_type num_cols() const { return end_col - start_col + 1; } + + CUDA_DEVICE_CALLABLE + size_type num_rows() const { return end_row - start_row + 1; } +}; + +/** + * @brief Returning rows is done in a byte cudf column. This is limited in size by + * `size_type` and so output is broken into batches of rows that fit inside + * this limit. + * + */ +struct row_batch { + size_type num_bytes; // number of bytes in this batch + size_type row_count; // number of rows in the batch + device_uvector row_offsets; // offsets column of output cudf column +}; + +/** + * @brief Holds information about the batches of data to be processed + * + */ +struct batch_data { + device_uvector batch_row_offsets; // offset column of returned cudf column + device_uvector d_batch_row_boundaries; // row numbers for the start of each batch + std::vector + batch_row_boundaries; // row numbers for the start of each batch: 0, 1500, 2700 + std::vector row_batches; // information about each batch such as byte count +}; + +struct row_offset_functor { + row_offset_functor(size_type fixed_width_only_row_size) + : _fixed_width_only_row_size(fixed_width_only_row_size){}; + + CUDA_DEVICE_CALLABLE + size_type operator()(int row_number, int tile_row_start) { + return (row_number - tile_row_start) * _fixed_width_only_row_size; + } + + size_type _fixed_width_only_row_size; +}; + +#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + /** * @brief Copies data from row-based JCUDF format to column-based cudf format. 
* @@ -336,132 +466,92 @@ __global__ void copy_to_rows_fixed_width_optimized( #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 -/** - * @brief The GPU blocks work on one or more block_info structs of data. - * This structure defined the workspace for the block. - * - */ -struct block_info { - int start_col; - int start_row; - int end_col; - int end_row; - int batch_number; - - constexpr size_type get_shared_row_size(size_type const *const col_offsets, - size_type const *const col_sizes) const { - return util::round_up_unsafe(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], - JCUDF_ROW_ALIGNMENT); - } - constexpr size_type num_cols() const { return end_col - start_col + 1; } - - constexpr size_type num_rows() const { return end_row - start_row + 1; } -}; - -/** - * @brief Returning rows is done in a byte cudf column. This is limited in size by - * `size_type` and so output is broken into batches of rows that fit inside - * this limit. - * - */ -struct row_batch { - size_type num_bytes; // number of bytes in this batch - size_type row_count; // number of rows in the batch - device_uvector row_offsets; // offsets column of output cudf column -}; - -/** - * @brief Holds information about the batches of data to be processed - * - */ -struct batch_data { - device_uvector batch_row_offsets; // offset column of returned cudf column - std::vector - batch_row_boundaries; // row numbers for different batches like 0, 10000, 20000 - std::vector row_batches; // information about each batch such as byte count -}; - /** * @brief copy data from cudf columns into JCUDF format, which is row-based * + * @tparam RowOffsetIter iterator that gives the size of a specific row of the table. * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table - * @param shmem_used_per_block shared memory amount each `block_info` is using - * @param block_infos span of `block_info` structs the define the work + * @param shmem_used_per_tile shared memory amount each `tile_info` is using + * @param tile_infos span of `tile_info` structs the define the work * @param input_data pointer to raw table data * @param col_sizes array of sizes for each element in a column - one per column * @param col_offsets offset into input data row for each column's start * @param row_offsets offset to a specific row in the output data + * @param batch_row_boundaries row numbers for batch starts * @param output_data pointer to output data * */ +template __global__ void copy_to_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_block, - device_span block_infos, const int8_t **input_data, + const size_type shmem_used_per_tile, + device_span tile_infos, const int8_t **input_data, const size_type *col_sizes, const size_type *col_offsets, - const size_type *row_offsets, int8_t **output_data) { + RowOffsetIter row_offsets, size_type const *batch_row_boundaries, + int8_t **output_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. // Because shared memory is limited we copy a subset of the rows at a time. - // This has been broken up for us in the block_info struct, so we don't have + // This has been broken up for us in the tile_info struct, so we don't have // any calculation to do here, but it is important to note. 
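The fetch/process loop that follows keeps fetch_index at most NUM_TILES_PER_KERNEL_LOADED tiles ahead of processing_index, so one shared-memory buffer can be filled while the previously fetched tile is written out. A host-side sketch of just that pipeline indexing, with prints standing in for the memcpy_async/barrier work (example tile count; not part of the patch):

#include <cstdio>

int main() {
  int const tiles_remaining = 5;  // tiles assigned to this block (example value)
  int const stages_count = 2;     // NUM_TILES_PER_KERNEL_LOADED in the kernel
  int fetch_index = 0;
  for (int processing_index = 0; processing_index < tiles_remaining; ++processing_index) {
    // fetch ahead, but never more than stages_count tiles beyond the one being processed
    for (; fetch_index < tiles_remaining && fetch_index < processing_index + stages_count;
         ++fetch_index) {
      std::printf("fetch tile %d into shared buffer %d\n", fetch_index, fetch_index % stages_count);
    }
    std::printf("process tile %d from shared buffer %d\n", processing_index,
                processing_index % stages_count);
  }
  return 0;
}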
- constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; + constexpr unsigned stages_count = NUM_TILES_PER_KERNEL_LOADED; auto group = cooperative_groups::this_thread_block(); extern __shared__ int8_t shared_data[]; - int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; + int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_tile}; - __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; + __shared__ cuda::barrier tile_barrier[NUM_TILES_PER_KERNEL_LOADED]; if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&block_barrier[i], group.size()); + for (int i = 0; i < NUM_TILES_PER_KERNEL_LOADED; ++i) { + init(&tile_barrier[i], group.size()); } } group.sync(); - auto const blocks_remaining = - std::min(static_cast(block_infos.size()) - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS, - static_cast(NUM_BLOCKS_PER_KERNEL_TO_ROWS)); - - size_t fetch; - size_t subset; - for (subset = fetch = 0; subset < blocks_remaining; ++subset) { - // Fetch ahead up to stages_count subsets - for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { - auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + fetch]; - auto const num_fetch_cols = fetch_block.num_cols(); - auto const num_fetch_rows = fetch_block.num_rows(); - auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; - auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); - auto const starting_column_offset = col_offsets[fetch_block.start_col]; - auto &fetch_barrier = block_barrier[fetch % NUM_BLOCKS_PER_KERNEL_LOADED]; + auto const tiles_remaining = + std::min(static_cast(tile_infos.size()) - blockIdx.x * NUM_TILES_PER_KERNEL_TO_ROWS, + static_cast(NUM_TILES_PER_KERNEL_TO_ROWS)); + + size_t fetch_index; //< tile we are currently fetching + size_t processing_index; //< tile we are currently processing + for (processing_index = fetch_index = 0; processing_index < tiles_remaining; ++processing_index) { + // Fetch ahead up to NUM_TILES_PER_KERNEL_LOADED + for (; fetch_index < tiles_remaining && fetch_index < (processing_index + stages_count); + ++fetch_index) { + auto const fetch_tile = tile_infos[blockIdx.x * NUM_TILES_PER_KERNEL_TO_ROWS + fetch_index]; + auto const num_fetch_cols = fetch_tile.num_cols(); + auto const num_fetch_rows = fetch_tile.num_rows(); + auto const num_elements_in_tile = num_fetch_cols * num_fetch_rows; + auto const fetch_tile_row_size = fetch_tile.get_shared_row_size(col_offsets, col_sizes); + auto const starting_column_offset = col_offsets[fetch_tile.start_col]; + auto &fetch_barrier = tile_barrier[fetch_index % NUM_TILES_PER_KERNEL_LOADED]; // wait for the last use of the memory to be completed - if (fetch >= NUM_BLOCKS_PER_KERNEL_LOADED) { + if (fetch_index >= NUM_TILES_PER_KERNEL_LOADED) { fetch_barrier.arrive_and_wait(); } // to do the copy we need to do n column copies followed by m element copies OR // we have to do m element copies followed by r row copies. When going from column // to row it is much easier to copy by elements first otherwise we would need a running - // total of the column sizes for our block, which isn't readily available. This makes it + // total of the column sizes for our tile, which isn't readily available. This makes it // more appealing to copy element-wise from input data into shared matching the end layout // and do row-based memcopies out. 
- auto const shared_buffer_base = shared[fetch % stages_count]; - for (auto el = static_cast(threadIdx.x); el < num_elements_in_block; el += blockDim.x) { + auto const shared_buffer_base = shared[fetch_index % stages_count]; + for (auto el = static_cast(threadIdx.x); el < num_elements_in_tile; el += blockDim.x) { auto const relative_col = el / num_fetch_rows; auto const relative_row = el % num_fetch_rows; - auto const absolute_col = relative_col + fetch_block.start_col; - auto const absolute_row = relative_row + fetch_block.start_row; + auto const absolute_col = relative_col + fetch_tile.start_col; + auto const absolute_row = relative_row + fetch_tile.start_row; auto const col_size = col_sizes[absolute_col]; auto const col_offset = col_offsets[absolute_col]; auto const relative_col_offset = col_offset - starting_column_offset; - auto const shared_offset = relative_row * fetch_block_row_size + relative_col_offset; + auto const shared_offset = relative_row * fetch_tile_row_size + relative_col_offset; auto const input_src = input_data[absolute_col] + col_size * absolute_row; // copy the element from global memory @@ -486,59 +576,65 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum } } - auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; - subset_barrier.arrive_and_wait(); + auto &processing_barrier = tile_barrier[processing_index % NUM_TILES_PER_KERNEL_LOADED]; + processing_barrier.arrive_and_wait(); - auto const block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + subset]; - auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); - auto const column_offset = col_offsets[block.start_col]; - auto const block_output_buffer = output_data[block.batch_number]; + auto const tile = tile_infos[blockIdx.x * NUM_TILES_PER_KERNEL_TO_ROWS + processing_index]; + auto const tile_row_size = tile.get_shared_row_size(col_offsets, col_sizes); + auto const column_offset = col_offsets[tile.start_col]; + auto const tile_output_buffer = output_data[tile.batch_number]; + auto const row_batch_start = + tile.batch_number == 0 ? 
0 : batch_row_boundaries[tile.batch_number]; // copy entire row 8 bytes at a time - auto const chunks_per_row = util::div_rounding_up_unsafe(block_row_size, 8); - auto const total_chunks = chunks_per_row * block.num_rows(); + auto const chunks_per_row = util::div_rounding_up_unsafe(tile_row_size, 8); + auto const total_chunks = chunks_per_row * tile.num_rows(); for (auto i = threadIdx.x; i < total_chunks; i += blockDim.x) { // determine source address of my chunk auto const relative_row = i / chunks_per_row; auto const relative_chunk_offset = (i % chunks_per_row) * 8; - auto const output_dest = block_output_buffer + row_offsets[relative_row + block.start_row] + + auto const output_dest = tile_output_buffer + + row_offsets(relative_row + tile.start_row, row_batch_start) + column_offset + relative_chunk_offset; - auto const input_src = - &shared[subset % stages_count][block_row_size * relative_row + relative_chunk_offset]; + auto const input_src = &shared[processing_index % stages_count] + [tile_row_size * relative_row + relative_chunk_offset]; - cuda::memcpy_async(output_dest, input_src, cuda::aligned_size_t<8>(8), subset_barrier); + cuda::memcpy_async(output_dest, input_src, cuda::aligned_size_t<8>(8), processing_barrier); } } // wait on the last copies to complete - for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { - block_barrier[i].arrive_and_wait(); + for (uint i = 0; i < std::min(stages_count, tiles_remaining); ++i) { + tile_barrier[i].arrive_and_wait(); } } /** * @brief copy data from row-based format to cudf columns * + * @tparam RowOffsetIter iterator that gives the size of a specific row of the table. * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block + * @param shmem_used_per_tile amount of shared memory that is used by a tile * @param row_offsets offset to a specific row in the output data + * @param batch_row_boundaries row numbers for batch starts * @param output_data pointer to output data, partitioned by data size * @param validity_offsets offset into input data row for validity data - * @param block_infos information about the blocks of work - * @param input_data pointer to input data + * @param tile_infos information about the tiles of work + * @param input_nm pointer to input data * */ -__global__ void copy_validity_to_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_block, - const size_type *row_offsets, int8_t **output_data, - const size_type validity_offset, - device_span block_infos, - const bitmask_type **input_nm) { +template +__global__ void +copy_validity_to_rows(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_tile, RowOffsetIter row_offsets, + size_type const *batch_row_boundaries, int8_t **output_data, + const size_type validity_offset, device_span tile_infos, + const bitmask_type **input_nm) { extern __shared__ int8_t shared_data[]; - int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { - shared_data, shared_data + shmem_used_per_block / 2}; + int8_t *shared_tiles[NUM_VALIDITY_TILES_PER_KERNEL_LOADED] = { + shared_data, shared_data + shmem_used_per_tile / 2}; using cudf::detail::warp_size; @@ -548,51 +644,50 @@ __global__ void copy_validity_to_rows(const size_type num_rows, const size_type // probably need knobs for number of rows vs columns to balance read/write auto group = 
cooperative_groups::this_thread_block(); - int const blocks_remaining = - std::min(static_cast(block_infos.size()) - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, - static_cast(NUM_VALIDITY_BLOCKS_PER_KERNEL)); + int const tiles_remaining = + std::min(static_cast(tile_infos.size()) - blockIdx.x * NUM_VALIDITY_TILES_PER_KERNEL, + static_cast(NUM_VALIDITY_TILES_PER_KERNEL)); __shared__ cuda::barrier - shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + shared_tile_barriers[NUM_VALIDITY_TILES_PER_KERNEL_LOADED]; if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&shared_block_barriers[i], group.size()); + for (int i = 0; i < NUM_VALIDITY_TILES_PER_KERNEL_LOADED; ++i) { + init(&shared_tile_barriers[i], group.size()); } } group.sync(); - for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { - if (validity_block >= NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] - .arrive_and_wait(); + for (int validity_tile = 0; validity_tile < tiles_remaining; ++validity_tile) { + if (validity_tile >= NUM_VALIDITY_TILES_PER_KERNEL_LOADED) { + shared_tile_barriers[validity_tile % NUM_VALIDITY_TILES_PER_KERNEL_LOADED].arrive_and_wait(); } - int8_t *this_shared_block = shared_blocks[validity_block % 2]; - auto block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; + int8_t *this_shared_tile = shared_tiles[validity_tile % NUM_VALIDITY_TILES_PER_KERNEL_LOADED]; + auto tile = tile_infos[blockIdx.x * NUM_VALIDITY_TILES_PER_KERNEL + validity_tile]; - auto const num_block_cols = block.num_cols(); - auto const num_block_rows = block.num_rows(); + auto const num_tile_cols = tile.num_cols(); + auto const num_tile_rows = tile.num_rows(); - auto const num_sections_x = util::div_rounding_up_unsafe(num_block_cols, 32); - auto const num_sections_y = util::div_rounding_up_unsafe(num_block_rows, 32); + auto const num_sections_x = util::div_rounding_up_unsafe(num_tile_cols, 32); + auto const num_sections_y = util::div_rounding_up_unsafe(num_tile_rows, 32); auto const validity_data_row_length = - util::round_up_unsafe(util::div_rounding_up_unsafe(num_block_cols, 8), JCUDF_ROW_ALIGNMENT); + util::round_up_unsafe(util::div_rounding_up_unsafe(num_tile_cols, 8), JCUDF_ROW_ALIGNMENT); auto const total_sections = num_sections_x * num_sections_y; int const warp_id = threadIdx.x / warp_size; int const lane_id = threadIdx.x % warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / warp_size); + auto const warps_per_tile = std::max(1u, blockDim.x / warp_size); - // the block is divided into sections. A warp operates on a section at a time. + // the tile is divided into sections. A warp operates on a section at a time. 
for (int my_section_idx = warp_id; my_section_idx < total_sections; - my_section_idx += warps_per_block) { + my_section_idx += warps_per_tile) { // convert to rows and cols auto const section_x = my_section_idx % num_sections_x; auto const section_y = my_section_idx / num_sections_x; auto const relative_col = section_x * 32 + lane_id; auto const relative_row = section_y * 32; - auto const absolute_col = relative_col + block.start_col; - auto const absolute_row = relative_row + block.start_row; + auto const absolute_col = relative_col + tile.start_col; + auto const absolute_row = relative_row + tile.start_row; auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); if (absolute_col < num_columns) { @@ -610,141 +705,145 @@ __global__ void copy_validity_to_rows(const size_type num_rows, const size_type auto const validity_write_offset = validity_data_row_length * (relative_row + i) + relative_col / 8; if (threadIdx.x % warp_size == 0) { - *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data; + *reinterpret_cast(&this_shared_tile[validity_write_offset]) = validity_data; } } } } - // make sure entire block has finished copy + // make sure entire tile has finished copy group.sync(); auto const output_data_base = - output_data[block.batch_number] + validity_offset + block.start_col / 8; + output_data[tile.batch_number] + validity_offset + tile.start_col / 8; // now async memcpy the shared memory out to the final destination 4 bytes at a time since we do // 32-row chunks - auto const row_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); + auto const row_bytes = util::div_rounding_up_unsafe(num_tile_cols, 8); auto const chunks_per_row = util::div_rounding_up_unsafe(row_bytes, 8); - auto const total_chunks = chunks_per_row * block.num_rows(); - auto &subset_barrier = - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + auto const total_chunks = chunks_per_row * tile.num_rows(); + auto &processing_barrier = + shared_tile_barriers[validity_tile % NUM_VALIDITY_TILES_PER_KERNEL_LOADED]; auto const tail_bytes = row_bytes % 8; + auto const row_batch_start = + tile.batch_number == 0 ? 
0 : batch_row_boundaries[tile.batch_number]; for (auto i = threadIdx.x; i < total_chunks; i += blockDim.x) { // determine source address of my chunk auto const relative_row = i / chunks_per_row; auto const col_chunk = i % chunks_per_row; auto const relative_chunk_offset = col_chunk * 8; - auto const output_dest = - output_data_base + row_offsets[relative_row + block.start_row] + relative_chunk_offset; + auto const output_dest = output_data_base + + row_offsets(relative_row + tile.start_row, row_batch_start) + + relative_chunk_offset; auto const input_src = - &this_shared_block[validity_data_row_length * relative_row + relative_chunk_offset]; + &this_shared_tile[validity_data_row_length * relative_row + relative_chunk_offset]; if (tail_bytes > 0 && col_chunk == chunks_per_row - 1) - cuda::memcpy_async(output_dest, input_src, tail_bytes, subset_barrier); + cuda::memcpy_async(output_dest, input_src, tail_bytes, processing_barrier); else - cuda::memcpy_async(output_dest, input_src, cuda::aligned_size_t<8>(8), subset_barrier); + cuda::memcpy_async(output_dest, input_src, cuda::aligned_size_t<8>(8), processing_barrier); } } - // wait for last blocks of data to arrive - for (int validity_block = 0; - validity_block < blocks_remaining % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - ++validity_block) { - shared_block_barriers[validity_block].arrive_and_wait(); + // wait for last tiles of data to arrive + for (int validity_tile = 0; + validity_tile < tiles_remaining % NUM_VALIDITY_TILES_PER_KERNEL_LOADED; ++validity_tile) { + shared_tile_barriers[validity_tile].arrive_and_wait(); } } /** * @brief copy data from row-based format to cudf columns * + * @tparam RowOffsetIter iterator that gives the size of a specific row of the table. * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block + * @param shmem_used_per_tile amount of shared memory that is used by a tile * @param row_offsets offset to a specific row in the input data + * @param batch_row_boundaries row numbers for batch starts * @param output_data pointers to column data * @param col_sizes array of sizes for each element in a column - one per column * @param col_offsets offset into input data row for each column's start - * @param block_infos information about the blocks of work + * @param tile_infos information about the tiles of work * @param input_data pointer to input data * */ +template __global__ void copy_from_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_block, const size_type *row_offsets, - int8_t **output_data, const size_type *col_sizes, - const size_type *col_offsets, - device_span block_infos, - const int8_t *input_data) { + const size_type shmem_used_per_tile, RowOffsetIter row_offsets, + size_type const *batch_row_boundaries, int8_t **output_data, + const size_type *col_sizes, const size_type *col_offsets, + device_span tile_infos, const int8_t *input_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. // Because shared memory is limited we copy a subset of the rows at a time. - // This has been broken up for us in the block_info struct, so we don't have + // This has been broken up for us in the tile_info struct, so we don't have // any calculation to do here, but it is important to note. 
// to speed up some of the random access memory we do, we copy col_sizes and col_offsets - // to shared memory for each of the blocks that we work on + // to shared memory for each of the tiles that we work on - constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; + constexpr unsigned stages_count = NUM_TILES_PER_KERNEL_LOADED; auto group = cooperative_groups::this_thread_block(); extern __shared__ int8_t shared_data[]; - int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; + int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_tile}; - __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; + __shared__ cuda::barrier tile_barrier[NUM_TILES_PER_KERNEL_LOADED]; if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&block_barrier[i], group.size()); + for (int i = 0; i < NUM_TILES_PER_KERNEL_LOADED; ++i) { + init(&tile_barrier[i], group.size()); } } group.sync(); - auto blocks_remaining = - std::min(static_cast(block_infos.size()) - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS, - static_cast(NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); + auto tiles_remaining = + std::min(static_cast(tile_infos.size()) - blockIdx.x * NUM_TILES_PER_KERNEL_FROM_ROWS, + static_cast(NUM_TILES_PER_KERNEL_FROM_ROWS)); size_t fetch_index; size_t processing_index; - for (processing_index = fetch_index = 0; processing_index < blocks_remaining; - ++processing_index) { + for (processing_index = fetch_index = 0; processing_index < tiles_remaining; ++processing_index) { // Fetch ahead up to stages_count groups - for (; fetch_index < static_cast(blocks_remaining) && + for (; fetch_index < static_cast(tiles_remaining) && fetch_index < (processing_index + stages_count); ++fetch_index) { - auto const fetch_block = - block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + fetch_index]; - auto const fetch_block_start_row = fetch_block.start_row; - auto const starting_col_offset = col_offsets[fetch_block.start_col]; - auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); - auto &fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; + auto const fetch_tile = tile_infos[blockIdx.x * NUM_TILES_PER_KERNEL_FROM_ROWS + fetch_index]; + auto const fetch_tile_start_row = fetch_tile.start_row; + auto const starting_col_offset = col_offsets[fetch_tile.start_col]; + auto const fetch_tile_row_size = fetch_tile.get_shared_row_size(col_offsets, col_sizes); + auto &fetch_barrier = tile_barrier[fetch_index % NUM_TILES_PER_KERNEL_LOADED]; + auto const row_batch_start = + fetch_tile.batch_number == 0 ? 
0 : batch_row_boundaries[fetch_tile.batch_number]; // if we have fetched all buffers, we need to wait for processing // to complete on them before we can use them again - if (fetch_index > NUM_BLOCKS_PER_KERNEL_LOADED) { + if (fetch_index > NUM_TILES_PER_KERNEL_LOADED) { fetch_barrier.arrive_and_wait(); } - for (auto row = fetch_block_start_row + static_cast(threadIdx.x); - row <= fetch_block.end_row; row += blockDim.x) { - auto shared_offset = (row - fetch_block_start_row) * fetch_block_row_size; + for (auto row = fetch_tile_start_row + static_cast(threadIdx.x); + row <= fetch_tile.end_row; row += blockDim.x) { + auto shared_offset = (row - fetch_tile_start_row) * fetch_tile_row_size; // copy the data cuda::memcpy_async(&shared[fetch_index % stages_count][shared_offset], - &input_data[row_offsets[row] + starting_col_offset], - fetch_block_row_size, fetch_barrier); + &input_data[row_offsets(row, row_batch_start) + starting_col_offset], + fetch_tile_row_size, fetch_barrier); } } - auto &processing_barrier = block_barrier[processing_index % NUM_BLOCKS_PER_KERNEL_LOADED]; + auto &processing_barrier = tile_barrier[processing_index % NUM_TILES_PER_KERNEL_LOADED]; // ensure our data is ready processing_barrier.arrive_and_wait(); - auto const block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + processing_index]; - auto const rows_in_block = block.num_rows(); - auto const cols_in_block = block.num_cols(); - auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); + auto const tile = tile_infos[blockIdx.x * NUM_TILES_PER_KERNEL_FROM_ROWS + processing_index]; + auto const rows_in_tile = tile.num_rows(); + auto const cols_in_tile = tile.num_cols(); + auto const tile_row_size = tile.get_shared_row_size(col_offsets, col_sizes); // now we copy from shared memory to final destination. // the data is laid out in rows in shared memory, so the reads @@ -753,15 +852,15 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col // to prevent each thread working on a single row and also to ensure // that all threads can do work in the case of more threads than rows, // we do a global index instead of a double for loop with col/row. 
- for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { - auto const relative_col = index % cols_in_block; - auto const relative_row = index / cols_in_block; - auto const absolute_col = relative_col + block.start_col; - auto const absolute_row = relative_row + block.start_row; + for (int index = threadIdx.x; index < rows_in_tile * cols_in_tile; index += blockDim.x) { + auto const relative_col = index % cols_in_tile; + auto const relative_row = index / cols_in_tile; + auto const absolute_col = relative_col + tile.start_col; + auto const absolute_row = relative_row + tile.start_row; - auto const shared_memory_row_offset = block_row_size * relative_row; + auto const shared_memory_row_offset = tile_row_size * relative_row; auto const shared_memory_offset = - col_offsets[absolute_col] - col_offsets[block.start_col] + shared_memory_row_offset; + col_offsets[absolute_col] - col_offsets[tile.start_col] + shared_memory_row_offset; auto const column_size = col_sizes[absolute_col]; int8_t *shmem_src = &shared[processing_index % stages_count][shared_memory_offset]; @@ -773,33 +872,36 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col } // wait on the last copies to complete - for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { - block_barrier[i].arrive_and_wait(); + for (uint i = 0; i < std::min(stages_count, tiles_remaining); ++i) { + tile_barrier[i].arrive_and_wait(); } } /** * @brief copy data from row-based format to cudf columns * + * @tparam RowOffsetIter iterator that gives the size of a specific row of the table. * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block + * @param shmem_used_per_tile amount of shared memory that is used by a tile * @param row_offsets offset to a specific row in the input data + * @param batch_row_boundaries row numbers for batch starts * @param output_nm pointers to null masks for columns * @param validity_offsets offset into input data row for validity data - * @param block_infos information about the blocks of work + * @param tile_infos information about the tiles of work * @param input_data pointer to input data * */ -__global__ void copy_validity_from_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_block, - const size_type *row_offsets, bitmask_type **output_nm, - const size_type validity_offset, - device_span block_infos, - const int8_t *input_data) { +template +__global__ void +copy_validity_from_rows(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_tile, RowOffsetIter row_offsets, + size_type const *batch_row_boundaries, bitmask_type **output_nm, + const size_type validity_offset, device_span tile_infos, + const int8_t *input_data) { extern __shared__ int8_t shared_data[]; - int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { - shared_data, shared_data + shmem_used_per_block / 2}; + int8_t *shared_tiles[NUM_VALIDITY_TILES_PER_KERNEL_LOADED] = { + shared_data, shared_data + shmem_used_per_tile / 2}; using cudf::detail::warp_size; @@ -809,55 +911,57 @@ __global__ void copy_validity_from_rows(const size_type num_rows, const size_typ // probably need knobs for number of rows vs columns to balance read/write auto group = cooperative_groups::this_thread_block(); - int const blocks_remaining = - std::min(static_cast(block_infos.size()) - blockIdx.x 
* NUM_VALIDITY_BLOCKS_PER_KERNEL, - static_cast(NUM_VALIDITY_BLOCKS_PER_KERNEL)); + int const tiles_remaining = + std::min(static_cast(tile_infos.size()) - blockIdx.x * NUM_VALIDITY_TILES_PER_KERNEL, + static_cast(NUM_VALIDITY_TILES_PER_KERNEL)); __shared__ cuda::barrier - shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + shared_tile_barriers[NUM_VALIDITY_TILES_PER_KERNEL_LOADED]; if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&shared_block_barriers[i], group.size()); + for (int i = 0; i < NUM_VALIDITY_TILES_PER_KERNEL_LOADED; ++i) { + init(&shared_tile_barriers[i], group.size()); } } group.sync(); - for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { - if (validity_block >= NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { - auto const validity_index = validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - shared_block_barriers[validity_index].arrive_and_wait(); + for (int validity_tile = 0; validity_tile < tiles_remaining; ++validity_tile) { + if (validity_tile >= NUM_VALIDITY_TILES_PER_KERNEL_LOADED) { + auto const validity_index = validity_tile % NUM_VALIDITY_TILES_PER_KERNEL_LOADED; + shared_tile_barriers[validity_index].arrive_and_wait(); } - int8_t *this_shared_block = shared_blocks[validity_block % 2]; - auto const block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; - auto const block_start_col = block.start_col; - auto const block_start_row = block.start_row; - auto const num_block_cols = block.num_cols(); - auto const num_block_rows = block.num_rows(); - auto const num_sections_x = util::div_rounding_up_safe(num_block_cols, 8); - auto const num_sections_y = util::div_rounding_up_safe(num_block_rows, 32); + int8_t *this_shared_tile = shared_tiles[validity_tile % 2]; + auto const tile = tile_infos[blockIdx.x * NUM_VALIDITY_TILES_PER_KERNEL + validity_tile]; + auto const tile_start_col = tile.start_col; + auto const tile_start_row = tile.start_row; + auto const num_tile_cols = tile.num_cols(); + auto const num_tile_rows = tile.num_rows(); + auto const num_sections_x = util::div_rounding_up_safe(num_tile_cols, 8); + auto const num_sections_y = util::div_rounding_up_safe(num_tile_rows, 32); auto const validity_data_col_length = num_sections_y * 4; // words to bytes auto const total_sections = num_sections_x * num_sections_y; int const warp_id = threadIdx.x / warp_size; int const lane_id = threadIdx.x % warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / warp_size); + auto const warps_per_tile = std::max(1u, blockDim.x / warp_size); - // the block is divided into sections. A warp operates on a section at a time. + // the tile is divided into sections. A warp operates on a section at a time. for (int my_section_idx = warp_id; my_section_idx < total_sections; - my_section_idx += warps_per_block) { + my_section_idx += warps_per_tile) { // convert section to row and col auto const section_x = my_section_idx % num_sections_x; auto const section_y = my_section_idx / num_sections_x; auto const relative_col = section_x * 8; auto const relative_row = section_y * 32 + lane_id; - auto const absolute_col = relative_col + block_start_col; - auto const absolute_row = relative_row + block_start_row; + auto const absolute_col = relative_col + tile_start_col; + auto const absolute_row = relative_row + tile_start_row; + auto const row_batch_start = + tile.batch_number == 0 ? 
0 : batch_row_boundaries[tile.batch_number]; auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_row < num_rows); if (absolute_row < num_rows) { - auto const my_byte = - input_data[row_offsets[absolute_row] + validity_offset + absolute_col / 8]; + auto const my_byte = input_data[row_offsets(absolute_row, row_batch_start) + + validity_offset + absolute_col / 8]; // so every thread that is participating in the warp has a byte, but it's row-based // data and we need it in column-based. So we shuffle the bits around to make @@ -870,47 +974,47 @@ __global__ void copy_validity_from_rows(const size_type num_rows, const size_typ auto const validity_write_offset = validity_data_col_length * (relative_col + i) + relative_row / 8; - *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data; + *reinterpret_cast(&this_shared_tile[validity_write_offset]) = validity_data; } } } } - // make sure entire block has finished copy + // make sure entire tile has finished copy group.sync(); // now async memcpy the shared memory out to the final destination 8 bytes at a time - auto const col_bytes = util::div_rounding_up_unsafe(num_block_rows, 8); + auto const col_bytes = util::div_rounding_up_unsafe(num_tile_rows, 8); auto const chunks_per_col = util::div_rounding_up_unsafe(col_bytes, 8); - auto const total_chunks = chunks_per_col * num_block_cols; - auto &subset_barrier = - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + auto const total_chunks = chunks_per_col * num_tile_cols; + auto &processing_barrier = + shared_tile_barriers[validity_tile % NUM_VALIDITY_TILES_PER_KERNEL_LOADED]; auto const tail_bytes = col_bytes % 8; for (auto i = threadIdx.x; i < total_chunks; i += blockDim.x) { // determine source address of my chunk auto const relative_col = i / chunks_per_col; auto const row_chunk = i % chunks_per_col; - auto const absolute_col = relative_col + block_start_col; + auto const absolute_col = relative_col + tile_start_col; auto const relative_chunk_byte_offset = row_chunk * 8; - auto const output_dest = - output_nm[absolute_col] + word_index(block_start_row) + row_chunk * 2; + auto const output_dest = output_nm[absolute_col] + word_index(tile_start_row) + row_chunk * 2; auto const input_src = - &this_shared_block[validity_data_col_length * relative_col + relative_chunk_byte_offset]; + &this_shared_tile[validity_data_col_length * relative_col + relative_chunk_byte_offset]; - if (tail_bytes > 0 && row_chunk == chunks_per_col - 1) - cuda::memcpy_async(output_dest, input_src, tail_bytes, subset_barrier); - else - cuda::memcpy_async(output_dest, input_src, cuda::aligned_size_t<8>(8), subset_barrier); + if (tail_bytes > 0 && row_chunk == chunks_per_col - 1) { + cuda::memcpy_async(output_dest, input_src, tail_bytes, processing_barrier); + } else { + cuda::memcpy_async(output_dest, input_src, cuda::aligned_size_t<8>(8), processing_barrier); + } } } - // wait for last blocks of data to arrive - auto const num_blocks_to_wait = blocks_remaining > NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED ? - NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED : - blocks_remaining; - for (int validity_block = 0; validity_block < num_blocks_to_wait; ++validity_block) { - shared_block_barriers[validity_block].arrive_and_wait(); + // wait for last tiles of data to arrive + auto const num_tiles_to_wait = tiles_remaining > NUM_VALIDITY_TILES_PER_KERNEL_LOADED ? 
+ NUM_VALIDITY_TILES_PER_KERNEL_LOADED : + tiles_remaining; + for (int validity_tile = 0; validity_tile < num_tiles_to_wait; ++validity_tile) { + shared_tile_barriers[validity_tile].arrive_and_wait(); } } @@ -1087,19 +1191,19 @@ static size_type compute_column_information(iterator begin, iterator end, } /** - * @brief Build `block_info` for the validity data to break up the work. + * @brief Build `tile_info` for the validity data to break up the work. * * @param num_columns number of columns in the table * @param num_rows number of rows in the table - * @param shmem_limit_per_block size of shared memory available to a single gpu block + * @param shmem_limit_per_tile size of shared memory available to a single gpu tile * @param row_batches batched row information for multiple output locations - * @return vector of `block_info` structs for validity data + * @return vector of `tile_info` structs for validity data */ -std::vector -build_validity_block_infos(size_type const &num_columns, size_type const &num_rows, - size_type const &shmem_limit_per_block, - std::vector const &row_batches) { - auto const desired_rows_and_columns = static_cast(sqrt(shmem_limit_per_block)); +std::vector +build_validity_tile_infos(size_type const &num_columns, size_type const &num_rows, + size_type const &shmem_limit_per_tile, + std::vector const &row_batches) { + auto const desired_rows_and_columns = static_cast(sqrt(shmem_limit_per_tile)); auto const column_stride = util::round_up_unsafe( [&]() { if (desired_rows_and_columns > num_columns) { @@ -1116,28 +1220,29 @@ build_validity_block_infos(size_type const &num_columns, size_type const &num_ro // element still takes 8 bytes! auto const bytes_per_row = util::round_up_safe(util::div_rounding_up_unsafe(column_stride, 8), JCUDF_ROW_ALIGNMENT); - auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); + auto const row_stride = + std::min(num_rows, util::round_down_safe(shmem_limit_per_tile / bytes_per_row, 64)); - std::vector validity_block_infos; + std::vector validity_tile_infos; for (int col = 0; col < num_columns; col += column_stride) { - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int current_tile_row_batch = 0; + int rows_left_in_batch = row_batches[current_tile_row_batch].row_count; int row = 0; while (row < num_rows) { if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; + current_tile_row_batch++; + rows_left_in_batch = row_batches[current_tile_row_batch].row_count; } - int const window_height = std::min(row_stride, rows_left_in_batch); + int const tile_height = std::min(row_stride, rows_left_in_batch); - validity_block_infos.emplace_back(detail::block_info{ - col, row, std::min(col + column_stride - 1, num_columns - 1), row + window_height - 1}); - row += window_height; - rows_left_in_batch -= window_height; + validity_tile_infos.emplace_back(detail::tile_info{ + col, row, std::min(col + column_stride - 1, num_columns - 1), row + tile_height - 1}); + row += tile_height; + rows_left_in_batch -= tile_height; } } - return validity_block_infos; + return validity_tile_infos; } template struct row_size_functor { @@ -1155,6 +1260,7 @@ template struct row_size_functor { * @tparam RowSize iterator that gives the size of a specific row of the table. * @param num_rows Total number of rows in the table * @param row_sizes iterator that gives the size of a specific row of the table. 
+ * @param all_fixed_width bool indicating all data in this table is fixed width * @param stream stream to operate on for this work * @param mr memory resource used to allocate any returned data * @returns vector of size_type's that indicate row numbers for batch boundaries and a @@ -1162,8 +1268,8 @@ template struct row_size_functor { */ template -batch_data build_batches(size_type num_rows, RowSize row_sizes, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { +batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_width, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { auto uint64_row_sizes = cudf::detail::make_counting_transform_iterator(0, row_size_functor(row_sizes)); auto const total_size = @@ -1173,7 +1279,7 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, rmm::cuda_stream auto const num_offsets = num_batches + 1; std::vector row_batches; std::vector batch_row_boundaries; - device_uvector batch_row_offsets(num_rows, stream); + device_uvector batch_row_offsets(all_fixed_width ? 0 : num_rows, stream); // at most max gpu memory / 2GB iterations. batch_row_boundaries.reserve(num_offsets); @@ -1212,8 +1318,10 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, rmm::cuda_stream // The output_batch_row_offsets vector is used as the offset column of the returned data. This // needs to be individually allocated, but the kernel needs a contiguous array of offsets or // more global lookups are necessary. - cudaMemcpy(batch_row_offsets.data() + last_row_end, output_batch_row_offsets.data(), - num_rows_in_batch * sizeof(size_type), cudaMemcpyDeviceToDevice); + if (!all_fixed_width) { + cudaMemcpy(batch_row_offsets.data() + last_row_end, output_batch_row_offsets.data(), + num_rows_in_batch * sizeof(size_type), cudaMemcpyDeviceToDevice); + } batch_row_boundaries.push_back(row_end); row_batches.push_back({batch_bytes, num_rows_in_batch, std::move(output_batch_row_offsets)}); @@ -1221,141 +1329,142 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, rmm::cuda_stream last_row_end = row_end; } - return {std::move(batch_row_offsets), batch_row_boundaries, std::move(row_batches)}; + return {std::move(batch_row_offsets), make_device_uvector_async(batch_row_boundaries, stream), + std::move(batch_row_boundaries), std::move(row_batches)}; } /** - * @brief Computes the number of blocks necessary given a window height and batch offsets + * @brief Computes the number of tiles necessary given a tile height and batch offsets * * @param batch_row_boundaries row boundaries for each batch - * @param desired_window_height height of each window in the table + * @param desired_tile_height height of each tile in the table * @param stream stream to use - * @return number of windows necessary + * @return number of tiles necessary */ -int compute_block_counts(device_span const &batch_row_boundaries, - int desired_window_height, rmm::cuda_stream_view stream) { +int compute_tile_counts(device_span const &batch_row_boundaries, + int desired_tile_height, rmm::cuda_stream_view stream) { size_type const num_batches = batch_row_boundaries.size() - 1; - device_uvector num_blocks(num_batches, stream); + device_uvector num_tiles(num_batches, stream); auto iter = thrust::make_counting_iterator(0); - thrust::transform(rmm::exec_policy(stream), iter, iter + num_batches, num_blocks.begin(), - [desired_window_height, + thrust::transform(rmm::exec_policy(stream), iter, iter + num_batches, num_tiles.begin(), + 
[desired_tile_height, batch_row_boundaries = batch_row_boundaries.data()] __device__(auto batch_index) -> size_type { return util::div_rounding_up_unsafe(batch_row_boundaries[batch_index + 1] - batch_row_boundaries[batch_index], - desired_window_height); + desired_tile_height); }); - return thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); + return thrust::reduce(rmm::exec_policy(stream), num_tiles.begin(), num_tiles.end()); } /** - * @brief Builds the `block_info` structs for a given table. + * @brief Builds the `tile_info` structs for a given table. * - * @param blocks span of blocks to populate + * @param tiles span of tiles to populate * @param batch_row_boundaries boundary to row batches - * @param column_start starting column of the window - * @param column_end ending column of the window - * @param desired_window_height height of the window + * @param column_start starting column of the tile + * @param column_end ending column of the tile + * @param desired_tile_height height of the tile * @param total_number_of_rows total number of rows in the table * @param stream stream to use - * @return number of windows created + * @return number of tiles created */ size_type -build_blocks(device_span blocks, - device_uvector const &batch_row_boundaries, // comes from build_batches - int column_start, int column_end, int desired_window_height, int total_number_of_rows, - rmm::cuda_stream_view stream) { +build_tiles(device_span tiles, + device_uvector const &batch_row_boundaries, // comes from build_batches + int column_start, int column_end, int desired_tile_height, int total_number_of_rows, + rmm::cuda_stream_view stream) { size_type const num_batches = batch_row_boundaries.size() - 1; - device_uvector num_blocks(num_batches, stream); + device_uvector num_tiles(num_batches, stream); auto iter = thrust::make_counting_iterator(0); - thrust::transform(rmm::exec_policy(stream), iter, iter + num_batches, num_blocks.begin(), - [desired_window_height, + thrust::transform(rmm::exec_policy(stream), iter, iter + num_batches, num_tiles.begin(), + [desired_tile_height, batch_row_boundaries = batch_row_boundaries.data()] __device__(auto batch_index) -> size_type { return util::div_rounding_up_unsafe(batch_row_boundaries[batch_index + 1] - batch_row_boundaries[batch_index], - desired_window_height); + desired_tile_height); }); - size_type const total_blocks = - thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); + size_type const total_tiles = + thrust::reduce(rmm::exec_policy(stream), num_tiles.begin(), num_tiles.end()); - device_uvector block_starts(num_batches + 1, stream); - auto block_iter = cudf::detail::make_counting_transform_iterator( - 0, [num_blocks = num_blocks.data(), num_batches] __device__(auto i) { - return (i < num_batches) ? num_blocks[i] : 0; + device_uvector tile_starts(num_batches + 1, stream); + auto tile_iter = cudf::detail::make_counting_transform_iterator( + 0, [num_tiles = num_tiles.data(), num_batches] __device__(auto i) { + return (i < num_batches) ? 
num_tiles[i] : 0; }); - thrust::exclusive_scan(rmm::exec_policy(stream), block_iter, block_iter + num_batches + 1, - block_starts.begin()); // in blocks + thrust::exclusive_scan(rmm::exec_policy(stream), tile_iter, tile_iter + num_batches + 1, + tile_starts.begin()); // in tiles thrust::transform( - rmm::exec_policy(stream), iter, iter + total_blocks, blocks.begin(), - [=, block_starts = block_starts.data(), - batch_row_boundaries = batch_row_boundaries.data()] __device__(size_type block_index) { - // what batch this block falls in + rmm::exec_policy(stream), iter, iter + total_tiles, tiles.begin(), + [=, tile_starts = tile_starts.data(), + batch_row_boundaries = batch_row_boundaries.data()] __device__(size_type tile_index) { + // what batch this tile falls in auto const batch_index_iter = - thrust::upper_bound(thrust::seq, block_starts, block_starts + num_batches, block_index); - auto const batch_index = std::distance(block_starts, batch_index_iter) - 1; - // local index within the block - int const local_block_index = block_index - block_starts[batch_index]; + thrust::upper_bound(thrust::seq, tile_starts, tile_starts + num_batches, tile_index); + auto const batch_index = std::distance(tile_starts, batch_index_iter) - 1; + // local index within the tile + int const local_tile_index = tile_index - tile_starts[batch_index]; // the start row for this batch. int const batch_row_start = batch_row_boundaries[batch_index]; - // the start row for this block - int const block_row_start = batch_row_start + (local_block_index * desired_window_height); - // the end row for this block + // the start row for this tile + int const tile_row_start = batch_row_start + (local_tile_index * desired_tile_height); + // the end row for this tile int const max_row = std::min(total_number_of_rows - 1, batch_index + 1 > num_batches ? std::numeric_limits::max() : static_cast(batch_row_boundaries[batch_index + 1]) - 1); - int const block_row_end = std::min( - batch_row_start + ((local_block_index + 1) * desired_window_height) - 1, max_row); + int const tile_row_end = + std::min(batch_row_start + ((local_tile_index + 1) * desired_tile_height) - 1, max_row); - // stuff the block - return block_info{column_start, block_row_start, column_end, block_row_end, - static_cast(batch_index)}; + // stuff the tile + return tile_info{column_start, tile_row_start, column_end, tile_row_end, + static_cast(batch_index)}; }); - return total_blocks; + return total_tiles; } /** - * @brief Determines what data should be operated on by each block for the incoming table. + * @brief Determines what data should be operated on by each tile for the incoming table. 
* - * @tparam WindowCallback Callback that receives the start and end columns of windows + * @tparam TileCallback Callback that receives the start and end columns of tiles * @param column_sizes vector of the size of each column * @param column_starts vector of the offset of each column - * @param first_row_batch_size size of the first row batch to limit max window size since a window + * @param first_row_batch_size size of the first row batch to limit max tile size since a tile * is unable to span batches * @param total_number_of_rows total number of rows in the table - * @param shmem_limit_per_block shared memory allowed per block - * @param f callback function called when building a window + * @param shmem_limit_per_tile shared memory allowed per tile + * @param f callback function called when building a tile */ -template -void determine_windows(std::vector const &column_sizes, - std::vector const &column_starts, - size_type const first_row_batch_size, size_type const total_number_of_rows, - size_type const &shmem_limit_per_block, WindowCallback f) { - // block infos are organized with the windows going "down" the columns +template +void determine_tiles(std::vector const &column_sizes, + std::vector const &column_starts, + size_type const first_row_batch_size, size_type const total_number_of_rows, + size_type const &shmem_limit_per_tile, TileCallback f) { + // tile infos are organized with the tile going "down" the columns // this provides the most coalescing of memory access - int current_window_width = 0; - int current_window_start_col = 0; + int current_tile_width = 0; + int current_tile_start_col = 0; - // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write - // would be memory cache line sized access, but since other blocks will read/write the edges + // the ideal tile height has lots of 8-byte reads and 8-byte writes. The optimal read/write + // would be memory cache line sized access, but since other tiles will read/write the edges // this may not turn out to be overly important. For now, we will attempt to build a square - // window as far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = + // tile as far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = // shared_mem_size since we want them equal, so height and width are sqrt(shared_mem_size). The // trick is that it's in bytes, not rows or columns. - auto const optimal_square_len = static_cast(sqrt(shmem_limit_per_block)); - auto const window_height = + auto const optimal_square_len = static_cast(sqrt(shmem_limit_per_tile)); + auto const tile_height = std::clamp(util::round_up_safe( std::min(optimal_square_len / column_sizes[0], total_number_of_rows), 32), 1, first_row_batch_size); int row_size = 0; - // march each column and build the blocks of appropriate sizes + // march each column and build the tiles of appropriate sizes for (uint col = 0; col < column_sizes.size(); ++col) { auto const col_size = column_sizes[col]; @@ -1366,25 +1475,25 @@ void determine_windows(std::vector const &column_sizes, auto const row_size_with_end_pad = util::round_up_unsafe(row_size_with_this_col, JCUDF_ROW_ALIGNMENT); - if (row_size_with_end_pad * window_height > shmem_limit_per_block) { - // too large, close this window, generate vertical blocks and restart - f(current_window_start_col, col == 0 ? 
col : col - 1, window_height); + if (row_size_with_end_pad * tile_height > shmem_limit_per_tile) { + // too large, close this tile, generate vertical tiles and restart + f(current_tile_start_col, col == 0 ? col : col - 1, tile_height); row_size = util::round_up_unsafe((column_starts[col] + column_sizes[col]) & 7, alignment_needed); - row_size += col_size; // alignment required for shared memory window boundary to match + row_size += col_size; // alignment required for shared memory tile boundary to match // alignment of output row - current_window_start_col = col; - current_window_width = 0; + current_tile_start_col = col; + current_tile_width = 0; } else { row_size = row_size_with_this_col; - current_window_width++; + current_tile_width++; } } - // build last set of blocks - if (current_window_width > 0) { - f(current_window_start_col, static_cast(column_sizes.size()) - 1, window_height); + // build last set of tiles + if (current_tile_width > 0) { + f(current_tile_start_col, static_cast(column_sizes.size()) - 1, tile_height); } } @@ -1399,18 +1508,23 @@ std::vector> convert_to_rows(table_view const &tbl, auto const num_columns = tbl.num_columns(); auto const num_rows = tbl.num_rows(); + auto const fixed_width_only = std::all_of( + tbl.begin(), tbl.end(), [](column_view const &c) { return is_fixed_width(c.type()); }); + int device_id; CUDA_TRY(cudaGetDevice(&device_id)); - int total_shmem; - CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - - // TODO: why is this needed. kernel fails to launch if all memory is requested. - total_shmem -= 1024; - auto const shmem_limit_per_block = total_shmem / NUM_BLOCKS_PER_KERNEL_LOADED; - - // break up the work into blocks, which are a starting and ending row/col #. - // this window size is calculated based on the shared memory size available - // we want a single block to fill up the entire shared memory space available + int total_shmem_in_bytes; + CUDA_TRY( + cudaDeviceGetAttribute(&total_shmem_in_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + + // Need to reduce total shmem available by the size of barriers in the kernel's shared memory + total_shmem_in_bytes -= + sizeof(cuda::barrier) * NUM_TILES_PER_KERNEL_LOADED; + auto const shmem_limit_per_tile = total_shmem_in_bytes / NUM_TILES_PER_KERNEL_LOADED; + + // break up the work into tiles, which are a starting and ending row/col #. + // this tile size is calculated based on the shared memory size available + // we want a single tile to fill up the entire shared memory space available // for the transpose-like conversion. // There are two different processes going on here. The GPU conversion of the data @@ -1419,19 +1533,19 @@ std::vector> convert_to_rows(table_view const &tbl, // this limitation because the column must own the data inside and as a result it must be // a distinct allocation for that column. Copying the data into these final buffers would // be prohibitively expensive, so care is taken to ensure the GPU writes to the proper buffer. - // The windows are broken at the boundaries of specific rows based on the row sizes up + // The tiles are broken at the boundaries of specific rows based on the row sizes up // to that point. These are row batches and they are decided first before building the - // windows so the windows can be properly cut around them. + // tiles so the tiles can be properly cut around them. 
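As a rough illustration of the square-tile sizing described above, the following self-contained sketch (hypothetical names, a single uniform column width, no JCUDF_ROW_ALIGNMENT padding) shows how a tile height and width can be derived from the shared-memory budget; the patch itself walks the real per-column sizes and pads each row.

    #include <algorithm>
    #include <cmath>

    struct tile_extent { int rows; int cols; };

    // Sketch only: aim for a tile that is roughly square in *bytes*, so the side
    // length is sqrt(shared memory budget). Height is rounded up to a multiple of
    // 32 rows and clamped to the first row batch, since a tile may not span batches.
    tile_extent pick_square_tile(int shmem_limit_bytes, int bytes_per_col, int num_rows,
                                 int num_cols, int first_batch_rows) {
      int const side   = static_cast<int>(std::sqrt(static_cast<double>(shmem_limit_bytes)));
      int const height = std::clamp(
          (std::min(side / std::max(bytes_per_col, 1), num_rows) + 31) / 32 * 32,
          1, first_batch_rows);
      // add columns until one more column would overflow the shared-memory budget
      int width = 0, row_bytes = 0;
      while (width < num_cols && (row_bytes + bytes_per_col) * height <= shmem_limit_bytes) {
        row_bytes += bytes_per_col;
        ++width;
      }
      return {height, std::max(width, 1)};
    }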
// Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; - input_data.reserve(num_columns); - input_nm.reserve(num_columns); - std::transform(tbl.begin(), tbl.end(), std::back_inserter(input_data), - [](column_view const &c) -> int8_t const * { return c.template data(); }); - std::transform(tbl.begin(), tbl.end(), std::back_inserter(input_nm), - [](auto c) { return c.null_mask(); }); + + auto data_begin = thrust::make_transform_iterator( + tbl.begin(), [](auto const &c) { return c.template data(); }); + std::vector input_data(data_begin, data_begin + tbl.num_columns()); + + auto nm_begin = + thrust::make_transform_iterator(tbl.begin(), [](auto const &c) { return c.null_mask(); }); + std::vector input_nm(nm_begin, nm_begin + tbl.num_columns()); auto dev_input_data = make_device_uvector_async(input_data, stream, mr); auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); @@ -1444,7 +1558,7 @@ std::vector> convert_to_rows(table_view const &tbl, auto schema_column_iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&tbl](auto i) -> std::tuple { - return std::make_tuple(tbl.column(i).type(), tbl.column(i)); + return {tbl.column(i).type(), tbl.column(i)}; }); auto const fixed_width_size_per_row = detail::compute_column_information( @@ -1461,9 +1575,7 @@ std::vector> convert_to_rows(table_view const &tbl, return util::round_up_unsafe(bytes_needed, JCUDF_ROW_ALIGNMENT); }); - auto batch_info = detail::build_batches(num_rows, row_size_iter, stream, mr); - auto gpu_batch_row_boundaries = - make_device_uvector_async(batch_info.batch_row_boundaries, stream); + auto batch_info = detail::build_batches(num_rows, row_size_iter, fixed_width_only, stream, mr); // the first batch always exists unless we were sent an empty table auto const first_batch_size = batch_info.row_batches[0].row_count; @@ -1479,52 +1591,67 @@ std::vector> convert_to_rows(table_view const &tbl, std::transform(output_buffers.begin(), output_buffers.end(), std::back_inserter(output_data), [](auto &buf) { return static_cast(buf.data()); }); + /* auto output_data_begin = thrust::make_transform_iterator(batch_info.row_batches.begin(), + [stream, mr](auto const& batch) { return rmm::device_buffer(batch.num_bytes, stream, mr); }); + std::vector output_buffers( output_data_begin, output_data_begin + + batch_info.row_batches.size() ); + + auto output_buffers_begin = thrust::make_transform_iterator(output_buffers.begin(), + [](auto const &buf) -> int8_t * { return static_cast(buf.data()); }); + std::vector output_data( output_buffers_begin, output_buffers_begin + + output_buffers.size() );*/ + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); int info_count = 0; - detail::determine_windows( - column_sizes, column_starts, first_batch_size, num_rows, shmem_limit_per_block, - [&gpu_batch_row_boundaries, &info_count, &stream](int const start_col, int const end_col, - int const window_height) { - int i = detail::compute_block_counts(gpu_batch_row_boundaries, window_height, stream); + detail::determine_tiles( + column_sizes, column_starts, first_batch_size, num_rows, shmem_limit_per_tile, + [&gpu_batch_row_boundaries = batch_info.d_batch_row_boundaries, &info_count, + &stream](int const start_col, int const end_col, int const tile_height) { + int i = detail::compute_tile_counts(gpu_batch_row_boundaries, tile_height, stream); info_count += i; }); - // allocate space for blocks - device_uvector gpu_block_infos(info_count, stream); - int 
block_offset = 0; - - detail::determine_windows( - column_sizes, column_starts, first_batch_size, num_rows, shmem_limit_per_block, - [&gpu_batch_row_boundaries, &gpu_block_infos, num_rows, &block_offset, - stream](int const start_col, int const end_col, int const window_height) { - block_offset += detail::build_blocks( - {gpu_block_infos.data() + block_offset, gpu_block_infos.size() - block_offset}, - gpu_batch_row_boundaries, start_col, end_col, window_height, num_rows, stream); + // allocate space for tiles + device_uvector gpu_tile_infos(info_count, stream); + int tile_offset = 0; + + detail::determine_tiles( + column_sizes, column_starts, first_batch_size, num_rows, shmem_limit_per_tile, + [&gpu_batch_row_boundaries = batch_info.d_batch_row_boundaries, &gpu_tile_infos, num_rows, + &tile_offset, stream](int const start_col, int const end_col, int const tile_height) { + tile_offset += detail::build_tiles( + {gpu_tile_infos.data() + tile_offset, gpu_tile_infos.size() - tile_offset}, + gpu_batch_row_boundaries, start_col, end_col, tile_height, num_rows, stream); }); // blast through the entire table and convert it - dim3 blocks(util::div_rounding_up_unsafe(gpu_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); + dim3 blocks(util::div_rounding_up_unsafe(gpu_tile_infos.size(), NUM_TILES_PER_KERNEL_TO_ROWS)); dim3 threads(256); - auto validity_block_infos = detail::build_validity_block_infos( - num_columns, num_rows, shmem_limit_per_block, batch_info.row_batches); + auto validity_tile_infos = detail::build_validity_tile_infos( + num_columns, num_rows, shmem_limit_per_tile, batch_info.row_batches); - auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream); + auto dev_validity_tile_infos = make_device_uvector_async(validity_tile_infos, stream); dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_VALIDITY_BLOCKS_PER_KERNEL)); - dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); - - detail::copy_to_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, gpu_block_infos, dev_input_data.data(), - dev_col_sizes.data(), dev_col_starts.data(), - batch_info.batch_row_offsets - .data(), // needs to be row offsets per batch, not overall JUST for output. + util::div_rounding_up_unsafe(validity_tile_infos.size(), NUM_VALIDITY_TILES_PER_KERNEL)); + dim3 validity_threads(std::min(validity_tile_infos.size() * 32, 128lu)); + + auto const fixed_width_only_row_size = util::round_up_unsafe( + fixed_width_size_per_row + util::div_rounding_up_safe(num_columns, 8), 8); + detail::row_offset_functor offset_functor(fixed_width_only_row_size); + + detail::copy_to_rows<<>>( + num_rows, num_columns, shmem_limit_per_tile, gpu_tile_infos, dev_input_data.data(), + dev_col_sizes.data(), dev_col_starts.data(), offset_functor, + batch_info.d_batch_row_boundaries.data(), reinterpret_cast(dev_output_data.data())); - detail::copy_validity_to_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, batch_info.batch_row_offsets.data(), - dev_output_data.data(), column_starts.back(), dev_validity_block_infos, dev_input_nm.data()); + detail::copy_validity_to_rows<<>>( + num_rows, num_columns, shmem_limit_per_tile, offset_functor, + batch_info.d_batch_row_boundaries.data(), dev_output_data.data(), column_starts.back(), + dev_validity_tile_infos, dev_input_nm.data()); // split up the output buffer into multiple buffers based on row batch sizes // and create list of byte columns @@ -1629,12 +1756,14 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, int device_id; CUDA_TRY(cudaGetDevice(&device_id)); - int total_shmem; - CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + int total_shmem_in_bytes; + CUDA_TRY( + cudaDeviceGetAttribute(&total_shmem_in_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - // TODO: why is this needed. kernel fails to launch if all memory is requested. - total_shmem -= 1024; - int shmem_limit_per_block = total_shmem / NUM_BLOCKS_PER_KERNEL_LOADED; + // Need to reduce total shmem available by the size of barriers in the kernel's shared memory + total_shmem_in_bytes -= + sizeof(cuda::barrier) * NUM_TILES_PER_KERNEL_LOADED; + int shmem_limit_per_tile = total_shmem_in_bytes / NUM_TILES_PER_KERNEL_LOADED; std::vector column_starts; std::vector column_sizes; @@ -1686,50 +1815,53 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, [num_rows] __device__(auto i) { return i == 0 ? 0 : num_rows; }); int info_count = 0; - detail::determine_windows(column_sizes, column_starts, num_rows, num_rows, shmem_limit_per_block, - [&gpu_batch_row_boundaries, &info_count, &stream]( - int const start_col, int const end_col, int const window_height) { - info_count += detail::compute_block_counts(gpu_batch_row_boundaries, - window_height, stream); - }); - - // allocate space for blocks - device_uvector gpu_block_infos(info_count, stream); - - int block_offset = 0; - detail::determine_windows( - column_sizes, column_starts, num_rows, num_rows, shmem_limit_per_block, - [&gpu_batch_row_boundaries, &gpu_block_infos, num_rows, &block_offset, - stream](int const start_col, int const end_col, int const window_height) { - block_offset += detail::build_blocks( - {gpu_block_infos.data() + block_offset, gpu_block_infos.size() - block_offset}, - gpu_batch_row_boundaries, start_col, end_col, window_height, num_rows, stream); + detail::determine_tiles(column_sizes, column_starts, num_rows, num_rows, shmem_limit_per_tile, + [&gpu_batch_row_boundaries, &info_count, + &stream](int const start_col, int const end_col, int const tile_height) { + info_count += detail::compute_tile_counts(gpu_batch_row_boundaries, + tile_height, stream); + }); + + // allocate space for tiles + device_uvector gpu_tile_infos(info_count, stream); + + int tile_offset = 0; + detail::determine_tiles( + column_sizes, column_starts, num_rows, num_rows, shmem_limit_per_tile, + [&gpu_batch_row_boundaries, &gpu_tile_infos, num_rows, &tile_offset, + stream](int const start_col, int const end_col, int const tile_height) { + tile_offset += detail::build_tiles( + {gpu_tile_infos.data() + tile_offset, gpu_tile_infos.size() - tile_offset}, + gpu_batch_row_boundaries, start_col, end_col, tile_height, num_rows, stream); }); - dim3 blocks( - util::div_rounding_up_unsafe(gpu_block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); - dim3 threads(std::min(std::min(256, shmem_limit_per_block / 8), static_cast(child.size()))); + dim3 blocks(util::div_rounding_up_unsafe(gpu_tile_infos.size(), NUM_TILES_PER_KERNEL_FROM_ROWS)); + dim3 threads(std::min(std::min(256, shmem_limit_per_tile / 8), static_cast(child.size()))); - auto validity_block_infos = - detail::build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); + auto validity_tile_infos = + detail::build_validity_tile_infos(num_columns, num_rows, shmem_limit_per_tile, row_batches); - auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream); + auto dev_validity_tile_infos = make_device_uvector_async(validity_tile_infos, stream); dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_VALIDITY_BLOCKS_PER_KERNEL)); + util::div_rounding_up_unsafe(validity_tile_infos.size(), NUM_VALIDITY_TILES_PER_KERNEL)); + + dim3 validity_threads(std::min(validity_tile_infos.size() * 32, 128lu)); - dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); + auto const fixed_width_only_row_size = util::round_up_unsafe( + fixed_width_size_per_row + util::div_rounding_up_safe(static_cast(num_columns), 8), + 8); + detail::row_offset_functor offset_functor(fixed_width_only_row_size); - detail::copy_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), - dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), gpu_block_infos, + detail::copy_from_rows<<>>( + num_rows, 
num_columns, shmem_limit_per_tile, offset_functor, gpu_batch_row_boundaries.data(), + dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), gpu_tile_infos, child.data()); - detail:: - copy_validity_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), - dev_output_nm.data(), column_starts.back(), dev_validity_block_infos, - child.data()); + detail::copy_validity_from_rows<<>>( + num_rows, num_columns, shmem_limit_per_tile, offset_functor, gpu_batch_row_boundaries.data(), + dev_output_nm.data(), column_starts.back(), dev_validity_tile_infos, child.data()); return std::make_unique
(std::move(output_columns)); #else @@ -1794,6 +1926,6 @@ std::unique_ptr
(std::move(output_columns));
 #else
@@ -1794,6 +1926,6 @@ std::unique_ptr<table>
convert_from_rows_fixed_width_optimized( } } -} // namespace java +} // namespace jni } // namespace cudf From 7fbe10dbb55873fd0ac03706dd283f7c5ca90229 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 23 Dec 2021 02:14:39 +0000 Subject: [PATCH 73/80] removing commented out code --- java/src/main/native/src/row_conversion.cu | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index c1b6bdbce5d..9df8d7b7f14 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -1591,16 +1591,6 @@ std::vector> convert_to_rows(table_view const &tbl, std::transform(output_buffers.begin(), output_buffers.end(), std::back_inserter(output_data), [](auto &buf) { return static_cast(buf.data()); }); - /* auto output_data_begin = thrust::make_transform_iterator(batch_info.row_batches.begin(), - [stream, mr](auto const& batch) { return rmm::device_buffer(batch.num_bytes, stream, mr); }); - std::vector output_buffers( output_data_begin, output_data_begin + - batch_info.row_batches.size() ); - - auto output_buffers_begin = thrust::make_transform_iterator(output_buffers.begin(), - [](auto const &buf) -> int8_t * { return static_cast(buf.data()); }); - std::vector output_data( output_buffers_begin, output_buffers_begin + - output_buffers.size() );*/ - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); int info_count = 0; From d47360d94e9054df002c94432f630a4f90c0d084 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 4 Jan 2022 16:48:03 +0000 Subject: [PATCH 74/80] updating from review comments --- .../cudf/detail/utilities/integer_utils.hpp | 61 +++++++++++++------ java/src/main/java/ai/rapids/cudf/Table.java | 9 --- 2 files changed, 41 insertions(+), 29 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp index 0e427c0418a..fe501279fd5 100644 --- a/cpp/include/cudf/detail/utilities/integer_utils.hpp +++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp @@ -1,7 +1,7 @@ /* * Copyright 2019 BlazingDB, Inc. * Copyright 2019 Eyal Rozenberg - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,12 +33,18 @@ namespace cudf { //! Utility functions namespace util { /** - * Finds the smallest integer not less than `number_to_round` and modulo `S` is - * zero. This function assumes that `number_to_round` is non-negative and - * `modulus` is positive. + * @brief Rounds `number_to_round` up to the next multiple of modulus + * + * @tparam S type to return + * @param number_to_round number that is being rounded + * @param modulus value to which to round + * @return smallest integer greater than `number_to_round` and modulo `S` is zero. + * + * @note This function assumes that `number_to_round` is non-negative and + * `modulus` is positive. The safety is in regard to rollover. */ template -inline S round_up_safe(S number_to_round, S modulus) +S round_up_safe(S number_to_round, S modulus) { auto remainder = number_to_round % modulus; if (remainder == 0) { return number_to_round; } @@ -50,20 +56,37 @@ inline S round_up_safe(S number_to_round, S modulus) } /** - * Finds the largest integer not greater than `number_to_round` and modulo `S` is - * zero. 
This function assumes that `number_to_round` is non-negative and - * `modulus` is positive. + * @brief Rounds `number_to_round` down to the last multiple of modulus + * + * @tparam S type to return + * @param number_to_round number that is being rounded + * @param modulus value to which to round + * @return largest integer not greater than `number_to_round` and modulo `S` is zero. + * + * @note This function assumes that `number_to_round` is non-negative and + * `modulus` is positive and does not check for overflow. */ template -inline S round_down_safe(S number_to_round, S modulus) +S round_down_safe(S number_to_round, S modulus) noexcept { auto remainder = number_to_round % modulus; auto rounded_down = number_to_round - remainder; return rounded_down; } +/** + * @brief Rounds `number_to_round` up to the next multiple of modulus + * + * @tparam S type to return + * @param number_to_round number that is being rounded + * @param modulus value to which to round + * @return smallest integer greater than `number_to_round` and modulo `S` is zero. + * + * @note This function assumes that `number_to_round` is non-negative and + * `modulus` is positive and does not check for overflow. + */ template -constexpr inline S round_up_unsafe(S number_to_round, S modulus) noexcept +constexpr S round_up_unsafe(S number_to_round, S modulus) noexcept { auto remainder = number_to_round % modulus; if (remainder == 0) { return number_to_round; } @@ -84,16 +107,16 @@ constexpr inline S round_up_unsafe(S number_to_round, S modulus) noexcept * the result will be incorrect */ template -constexpr inline S div_rounding_up_unsafe(const S& dividend, const T& divisor) noexcept +constexpr S div_rounding_up_unsafe(const S& dividend, const T& divisor) noexcept { return (dividend + divisor - 1) / divisor; } namespace detail { template -constexpr inline I div_rounding_up_safe(std::integral_constant, - I dividend, - I divisor) noexcept +constexpr I div_rounding_up_safe(std::integral_constant, + I dividend, + I divisor) noexcept { // TODO: This could probably be implemented faster return (dividend > divisor) ? 1 + div_rounding_up_unsafe(dividend - divisor, divisor) @@ -101,9 +124,7 @@ constexpr inline I div_rounding_up_safe(std::integral_constant, } template -constexpr inline I div_rounding_up_safe(std::integral_constant, - I dividend, - I divisor) noexcept +constexpr I div_rounding_up_safe(std::integral_constant, I dividend, I divisor) noexcept { auto quotient = dividend / divisor; auto remainder = dividend % divisor; @@ -125,14 +146,14 @@ constexpr inline I div_rounding_up_safe(std::integral_constant, * approach of using (dividend + divisor - 1) / divisor */ template -constexpr inline I div_rounding_up_safe(I dividend, I divisor) noexcept +constexpr I div_rounding_up_safe(I dividend, I divisor) noexcept { using i_is_a_signed_type = std::integral_constant::value>; return detail::div_rounding_up_safe(i_is_a_signed_type{}, dividend, divisor); } template -constexpr inline bool is_a_power_of_two(I val) noexcept +constexpr bool is_a_power_of_two(I val) noexcept { static_assert(std::is_integral::value, "This function only applies to integral types"); return ((val - 1) & val) == 0; @@ -162,7 +183,7 @@ constexpr inline bool is_a_power_of_two(I val) noexcept * @return Absolute value if value type is signed. 
*/ template -constexpr inline auto absolute_value(T value) -> T +constexpr auto absolute_value(T value) -> T { if constexpr (cuda::std::is_signed()) return numeric::detail::abs(value); return value; diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 06383c82ae6..6c34fd6f997 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -2789,15 +2789,6 @@ public ColumnVector[] convertToRowsFixedWidthOptimized() { return ret; } - public ColumnVector[] convertToRowsFixedWidthOptimized() { - long[] ptrs = convertToRowsFixedWidthOptimized(nativeHandle); - ColumnVector[] ret = new ColumnVector[ptrs.length]; - for (int i = 0; i < ptrs.length; i++) { - ret[i] = new ColumnVector(ptrs[i]); - } - return ret; - } - /** * Convert a column of list of bytes that is formatted like the output from `convertToRows` * and convert it back to a table. From 9b502718ad7d4fa814d5a78854715d04c3983e61 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 4 Jan 2022 18:45:20 +0000 Subject: [PATCH 75/80] Updating namespace --- java/src/main/native/src/TableJni.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 44c34d133ce..a3a00730f30 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -2754,7 +2754,7 @@ Java_ai_rapids_cudf_Table_convertToRowsFixedWidthOptimized(JNIEnv *env, jclass, cudf::jni::auto_set_device(env); cudf::table_view *n_input_table = reinterpret_cast(input_table); std::vector> cols = - cudf::java::convert_to_rows_fixed_width_optimized(*n_input_table); + cudf::jni::convert_to_rows_fixed_width_optimized(*n_input_table); int num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); for (int i = 0; i < num_columns; i++) { @@ -2812,7 +2812,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env try { cudf::jni::auto_set_device(env); cudf::table_view *n_input_table = reinterpret_cast(input_table); - std::vector> cols = cudf::java::convert_to_rows(*n_input_table); + std::vector> cols = cudf::jni::convert_to_rows(*n_input_table); int num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); for (int i = 0; i < num_columns; i++) { @@ -2839,7 +2839,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRowsFixedWidth types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i])); } std::unique_ptr result = - cudf::java::convert_from_rows_fixed_width_optimized(list_input, types_vec); + cudf::jni::convert_from_rows_fixed_width_optimized(list_input, types_vec); return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); @@ -2862,7 +2862,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *e for (int i = 0; i < n_types.size(); i++) { types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i])); } - std::unique_ptr result = cudf::java::convert_from_rows(list_input, types_vec); + std::unique_ptr result = cudf::jni::convert_from_rows(list_input, types_vec); return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); From fb7566cfef456c43f512c0ecb0731aed15b8e10b Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 4 Jan 2022 18:55:25 +0000 Subject: [PATCH 76/80] updating namespace --- java/src/main/native/src/row_conversion.hpp | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/java/src/main/native/src/row_conversion.hpp b/java/src/main/native/src/row_conversion.hpp index edc2768d4bb..181a9fa068d 100644 --- a/java/src/main/native/src/row_conversion.hpp +++ b/java/src/main/native/src/row_conversion.hpp @@ -23,7 +23,7 @@ #include namespace cudf { -namespace java { +namespace jni { std::vector> convert_to_rows_fixed_width_optimized( cudf::table_view const &tbl, From 5e1cf972552d6b53041439f1bf185b5a8f3aa403 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 5 Jan 2022 21:27:43 -0500 Subject: [PATCH 77/80] Update java/src/main/native/src/row_conversion.cu Co-authored-by: MithunR --- java/src/main/native/src/row_conversion.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 9df8d7b7f14..9b60fb667b6 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -106,7 +106,7 @@ namespace detail { * NUM_TILES_PER_KERNEL_LOADED tiles at one time. The block will load * as many tiles as it can fit into shared memory and then wait on the * first tile to completely load before processing. Processing in this - * case means coping the data from shared memory back out to device + * case means copying the data from shared memory back out to device * memory via memcpy_async. This kernel is completely memory bound. * * Batch Data: From a1e35459e77ed1429044aea662961d2ae42c7c34 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Fri, 7 Jan 2022 05:14:28 +0000 Subject: [PATCH 78/80] moving to a constant iterator and other review cleanup --- java/src/main/native/src/row_conversion.cu | 65 +++++++++------------ java/src/main/native/src/row_conversion.hpp | 4 +- 2 files changed, 30 insertions(+), 39 deletions(-) diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 9df8d7b7f14..2d701497942 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
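For a quick sanity check of the rounding helpers whose documentation is reworked in the integer_utils.hpp hunks above, here is a small standalone sketch; these are plain restatements of the documented behaviour for illustration, not the cudf::util templates themselves.

    #include <cassert>

    // Illustration only: the semantics the updated doc comments describe.
    int round_up_sketch(int v, int modulus)   { int r = v % modulus; return r == 0 ? v : v + modulus - r; }
    int round_down_sketch(int v, int modulus) { return v - v % modulus; }
    int div_rounding_up_sketch(int dividend, int divisor) { return (dividend + divisor - 1) / divisor; }

    int main() {
      assert(round_up_sketch(10, 8) == 16);       // smallest multiple of 8 not less than 10
      assert(round_down_sketch(10, 8) == 8);      // largest multiple of 8 not greater than 10
      assert(div_rounding_up_sketch(9, 8) == 2);  // ceil(9 / 8)
      return 0;
    }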
@@ -195,7 +195,7 @@ struct row_offset_functor { : _fixed_width_only_row_size(fixed_width_only_row_size){}; CUDA_DEVICE_CALLABLE - size_type operator()(int row_number, int tile_row_start) { + size_type operator()(int row_number, int tile_row_start) const { return (row_number - tile_row_start) * _fixed_width_only_row_size; } @@ -1187,7 +1187,10 @@ static size_type compute_column_information(iterator begin, iterator end, auto validity_offset = fixed_width_size_per_row; column_starts.push_back(validity_offset); - return fixed_width_size_per_row; + return util::round_up_unsafe( + fixed_width_size_per_row + + util::div_rounding_up_safe(static_cast(std::distance(begin, end)), 8), + JCUDF_ROW_ALIGNMENT); } /** @@ -1224,6 +1227,7 @@ build_validity_tile_infos(size_type const &num_columns, size_type const &num_row std::min(num_rows, util::round_down_safe(shmem_limit_per_tile / bytes_per_row, 64)); std::vector validity_tile_infos; + validity_tile_infos.reserve(num_columns / column_stride * num_rows / row_stride); for (int col = 0; col < num_columns; col += column_stride) { int current_tile_row_batch = 0; int rows_left_in_batch = row_batches[current_tile_row_batch].row_count; @@ -1245,13 +1249,21 @@ build_validity_tile_infos(size_type const &num_columns, size_type const &num_row return validity_tile_infos; } +/** + * @brief functor that returns the size of a row or 0 is row is greater than the number of rows in the table + * + * @tparam RowSize iterator that returns the size of a specific row + */ template struct row_size_functor { - RowSize _row_sizes; - size_type _num_rows; - row_size_functor(RowSize row_sizes) : _row_sizes(row_sizes){}; + row_size_functor(size_type row_end, RowSize row_sizes, size_type last_row_end) + : _row_end(row_end), _row_sizes(row_sizes), _last_row_end(last_row_end) {} CUDA_DEVICE_CALLABLE - uint64_t operator()(int row_index) { return static_cast(_row_sizes[row_index]); } + uint64_t operator()(int i) const { return i >= _row_end ? 
0 : _row_sizes[i + _last_row_end]; } + + size_type _row_end; + RowSize _row_sizes; + size_type _last_row_end; }; /** @@ -1266,14 +1278,10 @@ template struct row_size_functor { * @returns vector of size_type's that indicate row numbers for batch boundaries and a * device_uvector of row offsets */ - template batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_width, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - auto uint64_row_sizes = - cudf::detail::make_counting_transform_iterator(0, row_size_functor(row_sizes)); - auto const total_size = - thrust::reduce(rmm::exec_policy(stream), uint64_row_sizes, uint64_row_sizes + num_rows); + auto const total_size = thrust::reduce(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows); auto const num_batches = static_cast( util::div_rounding_up_safe(total_size, static_cast(MAX_BATCH_SIZE))); auto const num_offsets = num_batches + 1; @@ -1286,7 +1294,7 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_w batch_row_boundaries.push_back(0); size_type last_row_end = 0; device_uvector cumulative_row_sizes(num_rows, stream); - thrust::inclusive_scan(rmm::exec_policy(stream), uint64_row_sizes, uint64_row_sizes + num_rows, + thrust::inclusive_scan(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows, cumulative_row_sizes.begin()); while (static_cast(batch_row_boundaries.size()) < num_offsets) { @@ -1305,10 +1313,8 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_w auto const num_entries = row_end - last_row_end + 1; device_uvector output_batch_row_offsets(num_entries, stream, mr); - auto row_size_iter_bounded = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [row_end, row_sizes, last_row_end] __device__(auto i) { - return i >= row_end ? 0 : row_sizes[i + last_row_end]; - }); + auto row_size_iter_bounded = cudf::detail::make_counting_transform_iterator( + 0, row_size_functor(row_end, row_sizes, last_row_end)); thrust::exclusive_scan(rmm::exec_policy(stream), row_size_iter_bounded, row_size_iter_bounded + num_entries, output_batch_row_offsets.begin()); @@ -1568,13 +1574,7 @@ std::vector> convert_to_rows(table_view const &tbl, auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); // total encoded row size. This includes fixed-width data, validity, and variable-width data. 
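To make the fixed-width row-size arithmetic concrete, a minimal host-side sketch follows (a hypothetical helper, not the patch's compute_column_information, which also records per-column start offsets and handles nested types): each column is aligned to its own size, one validity bit per column is packed into bytes at the end, and the whole row is padded to the 8-byte JCUDF alignment.

    #include <cstdint>
    #include <vector>

    // Sketch only: fixed-width JCUDF row size = per-column-aligned data + validity
    // bytes, padded so consecutive rows stay 8-byte aligned.
    int32_t sketch_fixed_width_row_size(std::vector<int32_t> const &col_sizes) {
      int32_t constexpr row_alignment = 8;  // JCUDF_ROW_ALIGNMENT in the real code
      int32_t offset = 0;
      for (auto size : col_sizes) {
        offset = (offset + size - 1) / size * size;  // align column to its own size
        offset += size;
      }
      offset += (static_cast<int32_t>(col_sizes.size()) + 7) / 8;  // validity bits -> bytes
      return (offset + row_alignment - 1) / row_alignment * row_alignment;
    }
    // e.g. {1, 4, 2, 8}: data occupies 24 bytes after per-column alignment,
    // one validity byte brings it to 25, padded to 32 bytes per row.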
- auto row_size_iter = cudf::detail::make_counting_transform_iterator( - 0, [fixed_width_size_per_row, num_columns] __device__(auto i) { - auto const bytes_needed = - fixed_width_size_per_row + util::div_rounding_up_safe(num_columns, 8); - return util::round_up_unsafe(bytes_needed, JCUDF_ROW_ALIGNMENT); - }); - + auto row_size_iter = thrust::make_constant_iterator(fixed_width_size_per_row); auto batch_info = detail::build_batches(num_rows, row_size_iter, fixed_width_only, stream, mr); // the first batch always exists unless we were sent an empty table @@ -1627,9 +1627,7 @@ std::vector> convert_to_rows(table_view const &tbl, util::div_rounding_up_unsafe(validity_tile_infos.size(), NUM_VALIDITY_TILES_PER_KERNEL)); dim3 validity_threads(std::min(validity_tile_infos.size() * 32, 128lu)); - auto const fixed_width_only_row_size = util::round_up_unsafe( - fixed_width_size_per_row + util::div_rounding_up_safe(num_columns, 8), 8); - detail::row_offset_functor offset_functor(fixed_width_only_row_size); + detail::row_offset_functor offset_functor(fixed_width_size_per_row); detail::copy_to_rows<<>>( num_rows, num_columns, shmem_limit_per_tile, gpu_tile_infos, dev_input_data.data(), @@ -1764,14 +1762,10 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, auto const fixed_width_size_per_row = detail::compute_column_information(iter, iter + num_columns, column_starts, column_sizes); - auto const validity_size = num_bitmask_words(num_columns) * 4; - - auto const row_size = - util::round_up_unsafe(fixed_width_size_per_row + validity_size, JCUDF_ROW_ALIGNMENT); - // Ideally we would check that the offsets are all the same, etc. but for now // this is probably fine - CUDF_EXPECTS(row_size * num_rows == child.size(), "The layout of the data appears to be off"); + CUDF_EXPECTS(fixed_width_size_per_row * num_rows == child.size(), + "The layout of the data appears to be off"); auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); @@ -1838,10 +1832,7 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, dim3 validity_threads(std::min(validity_tile_infos.size() * 32, 128lu)); - auto const fixed_width_only_row_size = util::round_up_unsafe( - fixed_width_size_per_row + util::div_rounding_up_safe(static_cast(num_columns), 8), - 8); - detail::row_offset_functor offset_functor(fixed_width_only_row_size); + detail::row_offset_functor offset_functor(fixed_width_size_per_row); detail::copy_from_rows<<>>( num_rows, num_columns, shmem_limit_per_tile, offset_functor, gpu_batch_row_boundaries.data(), diff --git a/java/src/main/native/src/row_conversion.hpp b/java/src/main/native/src/row_conversion.hpp index 181a9fa068d..1a3cf37caba 100644 --- a/java/src/main/native/src/row_conversion.hpp +++ b/java/src/main/native/src/row_conversion.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -47,5 +47,5 @@ convert_from_rows(cudf::lists_column_view const &input, std::vector Date: Mon, 10 Jan 2022 04:24:33 +0000 Subject: [PATCH 79/80] Removing magic numbers per review comments --- java/src/main/native/src/row_conversion.cu | 74 +++++++++++++--------- 1 file changed, 43 insertions(+), 31 deletions(-) diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index e777099acb3..94b9e4bc143 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -587,20 +587,23 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum tile.batch_number == 0 ? 0 : batch_row_boundaries[tile.batch_number]; // copy entire row 8 bytes at a time - auto const chunks_per_row = util::div_rounding_up_unsafe(tile_row_size, 8); + constexpr auto bytes_per_chunk = 8; + auto const chunks_per_row = util::div_rounding_up_unsafe(tile_row_size, bytes_per_chunk); auto const total_chunks = chunks_per_row * tile.num_rows(); for (auto i = threadIdx.x; i < total_chunks; i += blockDim.x) { // determine source address of my chunk auto const relative_row = i / chunks_per_row; - auto const relative_chunk_offset = (i % chunks_per_row) * 8; + auto const relative_chunk_offset = (i % chunks_per_row) * bytes_per_chunk; auto const output_dest = tile_output_buffer + row_offsets(relative_row + tile.start_row, row_batch_start) + column_offset + relative_chunk_offset; auto const input_src = &shared[processing_index % stages_count] [tile_row_size * relative_row + relative_chunk_offset]; - cuda::memcpy_async(output_dest, input_src, cuda::aligned_size_t<8>(8), processing_barrier); + cuda::memcpy_async(output_dest, input_src, + cuda::aligned_size_t(bytes_per_chunk), + processing_barrier); } } @@ -670,8 +673,8 @@ copy_validity_to_rows(const size_type num_rows, const size_type num_columns, auto const num_sections_x = util::div_rounding_up_unsafe(num_tile_cols, 32); auto const num_sections_y = util::div_rounding_up_unsafe(num_tile_rows, 32); - auto const validity_data_row_length = - util::round_up_unsafe(util::div_rounding_up_unsafe(num_tile_cols, 8), JCUDF_ROW_ALIGNMENT); + auto const validity_data_row_length = util::round_up_unsafe( + util::div_rounding_up_unsafe(num_tile_cols, CHAR_BIT), JCUDF_ROW_ALIGNMENT); auto const total_sections = num_sections_x * num_sections_y; int const warp_id = threadIdx.x / warp_size; @@ -703,7 +706,7 @@ copy_validity_to_rows(const size_type num_rows, const size_type 
num_columns, auto validity_data = __ballot_sync(participation_mask, my_data & dw_mask); // lead thread in each warp writes data auto const validity_write_offset = - validity_data_row_length * (relative_row + i) + relative_col / 8; + validity_data_row_length * (relative_row + i) + relative_col / CHAR_BIT; if (threadIdx.x % warp_size == 0) { *reinterpret_cast(&this_shared_tile[validity_write_offset]) = validity_data; } @@ -715,16 +718,17 @@ copy_validity_to_rows(const size_type num_rows, const size_type num_columns, group.sync(); auto const output_data_base = - output_data[tile.batch_number] + validity_offset + tile.start_col / 8; + output_data[tile.batch_number] + validity_offset + tile.start_col / CHAR_BIT; // now async memcpy the shared memory out to the final destination 4 bytes at a time since we do // 32-row chunks - auto const row_bytes = util::div_rounding_up_unsafe(num_tile_cols, 8); - auto const chunks_per_row = util::div_rounding_up_unsafe(row_bytes, 8); + constexpr auto bytes_per_chunk = 8; + auto const row_bytes = util::div_rounding_up_unsafe(num_tile_cols, CHAR_BIT); + auto const chunks_per_row = util::div_rounding_up_unsafe(row_bytes, bytes_per_chunk); auto const total_chunks = chunks_per_row * tile.num_rows(); auto &processing_barrier = shared_tile_barriers[validity_tile % NUM_VALIDITY_TILES_PER_KERNEL_LOADED]; - auto const tail_bytes = row_bytes % 8; + auto const tail_bytes = row_bytes % bytes_per_chunk; auto const row_batch_start = tile.batch_number == 0 ? 0 : batch_row_boundaries[tile.batch_number]; @@ -732,7 +736,7 @@ copy_validity_to_rows(const size_type num_rows, const size_type num_columns, // determine source address of my chunk auto const relative_row = i / chunks_per_row; auto const col_chunk = i % chunks_per_row; - auto const relative_chunk_offset = col_chunk * 8; + auto const relative_chunk_offset = col_chunk * bytes_per_chunk; auto const output_dest = output_data_base + row_offsets(relative_row + tile.start_row, row_batch_start) + relative_chunk_offset; @@ -742,7 +746,9 @@ copy_validity_to_rows(const size_type num_rows, const size_type num_columns, if (tail_bytes > 0 && col_chunk == chunks_per_row - 1) cuda::memcpy_async(output_dest, input_src, tail_bytes, processing_barrier); else - cuda::memcpy_async(output_dest, input_src, cuda::aligned_size_t<8>(8), processing_barrier); + cuda::memcpy_async(output_dest, input_src, + cuda::aligned_size_t(bytes_per_chunk), + processing_barrier); } } @@ -936,8 +942,9 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns, auto const tile_start_row = tile.start_row; auto const num_tile_cols = tile.num_cols(); auto const num_tile_rows = tile.num_rows(); - auto const num_sections_x = util::div_rounding_up_safe(num_tile_cols, 8); - auto const num_sections_y = util::div_rounding_up_safe(num_tile_rows, 32); + constexpr auto rows_per_read = 32; + auto const num_sections_x = util::div_rounding_up_safe(num_tile_cols, CHAR_BIT); + auto const num_sections_y = util::div_rounding_up_safe(num_tile_rows, rows_per_read); auto const validity_data_col_length = num_sections_y * 4; // words to bytes auto const total_sections = num_sections_x * num_sections_y; int const warp_id = threadIdx.x / warp_size; @@ -950,8 +957,8 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns, // convert section to row and col auto const section_x = my_section_idx % num_sections_x; auto const section_y = my_section_idx / num_sections_x; - auto const relative_col = section_x * 8; - auto const relative_row = section_y 
@@ -936,8 +942,9 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns,
   auto const tile_start_row = tile.start_row;
   auto const num_tile_cols = tile.num_cols();
   auto const num_tile_rows = tile.num_rows();
-  auto const num_sections_x = util::div_rounding_up_safe(num_tile_cols, 8);
-  auto const num_sections_y = util::div_rounding_up_safe(num_tile_rows, 32);
+  constexpr auto rows_per_read = 32;
+  auto const num_sections_x = util::div_rounding_up_safe(num_tile_cols, CHAR_BIT);
+  auto const num_sections_y = util::div_rounding_up_safe(num_tile_rows, rows_per_read);
   auto const validity_data_col_length = num_sections_y * 4;  // words to bytes
   auto const total_sections = num_sections_x * num_sections_y;
   int const warp_id = threadIdx.x / warp_size;
@@ -950,8 +957,8 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns,
     // convert section to row and col
     auto const section_x = my_section_idx % num_sections_x;
     auto const section_y = my_section_idx / num_sections_x;
-    auto const relative_col = section_x * 8;
-    auto const relative_row = section_y * 32 + lane_id;
+    auto const relative_col = section_x * CHAR_BIT;
+    auto const relative_row = section_y * rows_per_read + lane_id;
     auto const absolute_col = relative_col + tile_start_col;
     auto const absolute_row = relative_row + tile_start_row;
     auto const row_batch_start =
@@ -961,18 +968,18 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns,
     if (absolute_row < num_rows) {
       auto const my_byte = input_data[row_offsets(absolute_row, row_batch_start) +
-                                      validity_offset + absolute_col / 8];
+                                      validity_offset + absolute_col / CHAR_BIT];
 
       // so every thread that is participating in the warp has a byte, but it's row-based
       // data and we need it in column-based. So we shuffle the bits around to make
       // the bytes we actually write.
-      for (int i = 0, byte_mask = 1; i < 8 && relative_col + i < num_columns;
+      for (int i = 0, byte_mask = 1; i < CHAR_BIT && relative_col + i < num_columns;
           ++i, byte_mask <<= 1) {
         auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask);
 
         // lead thread in each warp writes data
         if (threadIdx.x % warp_size == 0) {
           auto const validity_write_offset =
-              validity_data_col_length * (relative_col + i) + relative_row / 8;
+              validity_data_col_length * (relative_col + i) + relative_row / CHAR_BIT;
           *reinterpret_cast(&this_shared_tile[validity_write_offset]) = validity_data;
         }
 
@@ -984,19 +991,20 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns,
   group.sync();
 
   // now async memcpy the shared memory out to the final destination 8 bytes at a time
-  auto const col_bytes = util::div_rounding_up_unsafe(num_tile_rows, 8);
-  auto const chunks_per_col = util::div_rounding_up_unsafe(col_bytes, 8);
+  constexpr auto bytes_per_chunk = 8;
+  auto const col_bytes = util::div_rounding_up_unsafe(num_tile_rows, CHAR_BIT);
+  auto const chunks_per_col = util::div_rounding_up_unsafe(col_bytes, bytes_per_chunk);
   auto const total_chunks = chunks_per_col * num_tile_cols;
   auto &processing_barrier =
       shared_tile_barriers[validity_tile % NUM_VALIDITY_TILES_PER_KERNEL_LOADED];
-  auto const tail_bytes = col_bytes % 8;
+  auto const tail_bytes = col_bytes % bytes_per_chunk;
 
   for (auto i = threadIdx.x; i < total_chunks; i += blockDim.x) {
     // determine source address of my chunk
     auto const relative_col = i / chunks_per_col;
     auto const row_chunk = i % chunks_per_col;
     auto const absolute_col = relative_col + tile_start_col;
-    auto const relative_chunk_byte_offset = row_chunk * 8;
+    auto const relative_chunk_byte_offset = row_chunk * bytes_per_chunk;
     auto const output_dest = output_nm[absolute_col] + word_index(tile_start_row) + row_chunk * 2;
     auto const input_src = &this_shared_tile[validity_data_col_length * relative_col +
                                              relative_chunk_byte_offset];
@@ -1004,7 +1012,9 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns,
     if (tail_bytes > 0 && row_chunk == chunks_per_col - 1) {
       cuda::memcpy_async(output_dest, input_src, tail_bytes, processing_barrier);
     } else {
-      cuda::memcpy_async(output_dest, input_src, cuda::aligned_size_t<8>(8), processing_barrier);
+      cuda::memcpy_async(output_dest, input_src,
+                         cuda::aligned_size_t(bytes_per_chunk),
+                         processing_barrier);
     }
   }
 }
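The chunked cuda::memcpy_async pattern that both kernels now express with bytes_per_chunk can be read in isolation in the sketch below. It is an illustration, not the patch's code: chunked_copy, row_bytes, and the staging loop are hypothetical, and it assumes an sm_70-or-newer device, an 8-byte-aligned destination buffer, and enough dynamic shared memory for row_bytes bytes.

#include <cuda/barrier>

#include <cstddef>
#include <cstdint>

// Hypothetical illustration only: copy row_bytes bytes from shared memory to global
// memory in 8-byte aligned chunks, with a plain-sized copy for the short tail.
__global__ void chunked_copy(uint8_t *out, size_t row_bytes) {
  extern __shared__ uint8_t smem[];

  __shared__ cuda::barrier<cuda::thread_scope_block> barrier;
  if (threadIdx.x == 0) { init(&barrier, blockDim.x); }
  __syncthreads();

  // stage some bytes in shared memory (stand-in for the validity words above)
  for (size_t i = threadIdx.x; i < row_bytes; i += blockDim.x) { smem[i] = i & 0xff; }
  __syncthreads();

  constexpr size_t bytes_per_chunk = 8;
  size_t const num_chunks = (row_bytes + bytes_per_chunk - 1) / bytes_per_chunk;
  size_t const tail_bytes = row_bytes % bytes_per_chunk;

  for (size_t chunk = threadIdx.x; chunk < num_chunks; chunk += blockDim.x) {
    size_t const offset = chunk * bytes_per_chunk;
    if (tail_bytes > 0 && chunk == num_chunks - 1) {
      // the last chunk may be short, so its size cannot promise 8-byte alignment
      cuda::memcpy_async(out + offset, smem + offset, tail_bytes, barrier);
    } else {
      cuda::memcpy_async(out + offset, smem + offset,
                         cuda::aligned_size_t<bytes_per_chunk>(bytes_per_chunk), barrier);
    }
  }

  // wait until every copy bound to the barrier has landed
  barrier.arrive_and_wait();
}

Advertising the copies as 8-byte sized and aligned is what permits wider copies under the hood, while the tail falls back to a plain byte count, mirroring the tail_bytes branches in the hunks above.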
@@ -1144,7 +1154,8 @@ static inline int32_t compute_fixed_width_layout(std::vector const &s
   // Now we need to add in space for validity
   // Eventually we can think about nullable vs not nullable, but for now we will just always add
   // it in
-  int32_t const validity_bytes_needed = util::div_rounding_up_safe(schema.size(), 8);
+  int32_t const validity_bytes_needed =
+      util::div_rounding_up_safe(schema.size(), CHAR_BIT);
   // validity comes at the end and is byte aligned so we can pack more in.
   at_offset += validity_bytes_needed;
   // Now we need to pad the end so all rows are 64 bit aligned
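As a back-of-the-envelope check of the layout math this hunk touches, the sketch below re-derives a fixed-width row size on the host. It is a simplified stand-in, not the library's code: fixed_width_row_size is a hypothetical name, and the per-column self-alignment step is an assumption about the earlier part of compute_fixed_width_layout that this hunk does not show.

#include <climits>
#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical re-derivation: align each column to its own width, append one validity
// bit per column rounded up to whole bytes, then pad the row out to 8 bytes.
int32_t fixed_width_row_size(std::vector<int32_t> const &col_sizes) {
  int32_t at_offset = 0;
  for (auto const size : col_sizes) {
    at_offset = (at_offset + size - 1) / size * size;  // self-align the column
    at_offset += size;
  }
  at_offset += (static_cast<int32_t>(col_sizes.size()) + CHAR_BIT - 1) / CHAR_BIT;
  return (at_offset + 7) / 8 * 8;  // rows stay 64-bit aligned
}

int main() {
  // an example schema of nine columns: 1-, 4-, 2-, 8-, 4-, 1-, 2-, 1- and 8-byte types
  std::printf("%d\n", fixed_width_row_size({1, 4, 2, 8, 4, 1, 2, 1, 8}));  // prints 56
  return 0;
}

The change in the hunk itself only concerns the validity term: ceil(num_columns / CHAR_BIT) bytes appended after the data and before the final 8-byte padding.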
@@ -1189,7 +1200,7 @@ static size_type compute_column_information(iterator begin, iterator end,
 
   return util::round_up_unsafe(
       fixed_width_size_per_row +
-          util::div_rounding_up_safe(static_cast(std::distance(begin, end)), 8),
+          util::div_rounding_up_safe(static_cast(std::distance(begin, end)), CHAR_BIT),
       JCUDF_ROW_ALIGNMENT);
 }
 
@@ -1211,9 +1222,9 @@ build_validity_tile_infos(size_type const &num_columns, size_type const &num_row
       [&]() {
         if (desired_rows_and_columns > num_columns) {
           // not many columns, group it into 8s and ship it off
-          return std::min(8, num_columns);
+          return std::min(CHAR_BIT, num_columns);
         } else {
-          return util::round_down_safe(desired_rows_and_columns, 8);
+          return util::round_down_safe(desired_rows_and_columns, CHAR_BIT);
         }
       }(),
       JCUDF_ROW_ALIGNMENT);
@@ -1221,8 +1232,8 @@ build_validity_tile_infos(size_type const &num_columns, size_type const &num_row
   // we fit as much as we can given the column stride
   // note that an element in the table takes just 1 bit, but a row with a single
   // element still takes 8 bytes!
-  auto const bytes_per_row =
-      util::round_up_safe(util::div_rounding_up_unsafe(column_stride, 8), JCUDF_ROW_ALIGNMENT);
+  auto const bytes_per_row = util::round_up_safe(
+      util::div_rounding_up_unsafe(column_stride, CHAR_BIT), JCUDF_ROW_ALIGNMENT);
   auto const row_stride =
       std::min(num_rows, util::round_down_safe(shmem_limit_per_tile / bytes_per_row, 64));
 
@@ -1250,7 +1261,8 @@ build_validity_tile_infos(size_type const &num_columns, size_type const &num_row
 }
 
 /**
- * @brief functor that returns the size of a row or 0 is row is greater than the number of rows in the table
+ * @brief functor that returns the size of a row or 0 if the row is greater than the number of
+ * rows in the table
  *
  * @tparam RowSize iterator that returns the size of a specific row
  */

From 0d0015afcc1476b8e1b55d55cb525511b35df611 Mon Sep 17 00:00:00 2001
From: Mike Wilson
Date: Mon, 10 Jan 2022 04:26:40 +0000
Subject: [PATCH 80/80] removing magic number 2

---
 java/src/main/native/src/row_conversion.cu | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu
index 94b9e4bc143..3ef092792bf 100644
--- a/java/src/main/native/src/row_conversion.cu
+++ b/java/src/main/native/src/row_conversion.cu
@@ -1804,10 +1804,11 @@ std::unique_ptr convert_from_rows(lists_column_view const &input,
 
   // only ever get a single batch when going from rows, so boundaries
   // are 0, num_rows
-  device_uvector gpu_batch_row_boundaries(2, stream);
+  constexpr auto num_batches = 2;
+  device_uvector gpu_batch_row_boundaries(num_batches, stream);
 
   thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0),
-                    thrust::make_counting_iterator(2), gpu_batch_row_boundaries.begin(),
+                    thrust::make_counting_iterator(num_batches), gpu_batch_row_boundaries.begin(),
                     [num_rows] __device__(auto i) { return i == 0 ? 0 : num_rows; });
 
   int info_count = 0;
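To make the boundary-filling idiom in this last hunk concrete, here is a small self-contained sketch. It is a stand-in, not the patch's code: it uses plain Thrust containers instead of device_uvector and rmm::exec_policy, and the device lambda assumes compilation with nvcc's --extended-lambda flag.

#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform.h>

#include <cstdio>

int main() {
  int const num_rows = 1000;
  constexpr int num_batches = 2;  // a single batch needs the two boundaries {0, num_rows}

  thrust::device_vector<int> boundaries(num_batches);

  // i == 0 produces the starting boundary 0, every later index produces num_rows
  thrust::transform(thrust::make_counting_iterator(0),
                    thrust::make_counting_iterator(num_batches), boundaries.begin(),
                    [num_rows] __device__(auto i) { return i == 0 ? 0 : num_rows; });

  std::printf("%d %d\n", static_cast<int>(boundaries[0]),
              static_cast<int>(boundaries[1]));  // 0 1000
  return 0;
}

Naming num_batches instead of repeating the literal 2 is the point of this commit: the counting iterator's end and the buffer size can no longer drift apart.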