From 9c72e56837ddfb3fb9b3d1111cdd08e1f53595c4 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sat, 14 Aug 2021 05:08:16 -0500 Subject: [PATCH 01/32] simplify io/functions.cpp data source/sink factories --- cpp/src/io/functions.cpp | 126 +++++++++++++++++++++------------------ 1 file changed, 67 insertions(+), 59 deletions(-) diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index bf51012211c..e080ea3a2ca 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -106,67 +106,56 @@ chunked_parquet_writer_options_builder chunked_parquet_writer_options::builder( } namespace { -template -std::unique_ptr make_reader(source_info const& src_info, - reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - if (src_info.type == io_type::FILEPATH) { - return std::make_unique(src_info.filepaths, options, stream, mr); - } - std::vector> datasources; - if (src_info.type == io_type::HOST_BUFFER) { - datasources = cudf::io::datasource::create(src_info.buffers); - } else if (src_info.type == io_type::USER_IMPLEMENTED) { - datasources = cudf::io::datasource::create(src_info.user_sources); - } else { - CUDF_FAIL("Unsupported source type"); +std::vector> make_datasources(source_info const& info) +{ + switch (info.type) { + case io_type::FILEPATH: return cudf::io::datasource::create(info.filepaths); + case io_type::HOST_BUFFER: return cudf::io::datasource::create(info.buffers); + case io_type::USER_IMPLEMENTED: return cudf::io::datasource::create(info.user_sources); + default: CUDF_FAIL("Unsupported source type"); } - - return std::make_unique(std::move(datasources), options, stream, mr); } -template -std::unique_ptr make_writer(sink_info const& sink, Ts&&... args) +std::unique_ptr make_datasink(sink_info const& info) { - if (sink.type == io_type::FILEPATH) { - return std::make_unique(cudf::io::data_sink::create(sink.filepath), - std::forward(args)...); - } - if (sink.type == io_type::HOST_BUFFER) { - return std::make_unique(cudf::io::data_sink::create(sink.buffer), - std::forward(args)...); + switch (info.type) { + case io_type::FILEPATH: return cudf::io::data_sink::create(info.filepath); + case io_type::HOST_BUFFER: return cudf::io::data_sink::create(info.buffer); + case io_type::VOID: return cudf::io::data_sink::create(); + case io_type::USER_IMPLEMENTED: return cudf::io::data_sink::create(info.user_sink); + default: CUDF_FAIL("Unsupported sink type"); } - if (sink.type == io_type::VOID) { - return std::make_unique(cudf::io::data_sink::create(), std::forward(args)...); - } - if (sink.type == io_type::USER_IMPLEMENTED) { - return std::make_unique(cudf::io::data_sink::create(sink.user_sink), - std::forward(args)...); - } - CUDF_FAIL("Unsupported sink type"); } } // namespace -table_with_metadata read_avro(avro_reader_options const& opts, rmm::mr::device_memory_resource* mr) +table_with_metadata read_avro(avro_reader_options const& options, + rmm::mr::device_memory_resource* mr) { namespace avro = cudf::io::detail::avro; CUDF_FUNC_RANGE(); - auto reader = make_reader(opts.get_source(), opts, rmm::cuda_stream_default, mr); - return reader->read(opts); + + auto datasources = make_datasources(options.get_source()); + auto reader = + std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); + + return reader->read(options); } -table_with_metadata read_json(json_reader_options const& opts, rmm::mr::device_memory_resource* mr) +table_with_metadata read_json(json_reader_options const& options, + 
rmm::mr::device_memory_resource* mr) { namespace json = cudf::io::detail::json; CUDF_FUNC_RANGE(); - auto reader = make_reader(opts.get_source(), opts, rmm::cuda_stream_default, mr); - return reader->read(opts); + + auto datasources = make_datasources(options.get_source()); + auto reader = + std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); + + return reader->read(options); } table_with_metadata read_csv(csv_reader_options const& options, rmm::mr::device_memory_resource* mr) @@ -174,8 +163,10 @@ table_with_metadata read_csv(csv_reader_options const& options, rmm::mr::device_ namespace csv = cudf::io::detail::csv; CUDF_FUNC_RANGE(); + + auto datasources = make_datasources(options.get_source()); auto reader = - make_reader(options.get_source(), options, rmm::cuda_stream_default, mr); + std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); return reader->read(); } @@ -185,7 +176,9 @@ void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resourc { using namespace cudf::io::detail; - auto writer = make_writer(options.get_sink(), options, rmm::cuda_stream_default, mr); + auto sink = make_datasink(options.get_sink()); + auto writer = + std::make_unique(std::move(sink), options, rmm::cuda_stream_default, mr); writer->write(options.get_table(), options.get_metadata()); } @@ -294,8 +287,10 @@ parsed_orc_statistics read_parsed_orc_statistics(source_info const& src_info) table_with_metadata read_orc(orc_reader_options const& options, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - auto reader = - make_reader(options.get_source(), options, rmm::cuda_stream_default, mr); + + auto datasources = make_datasources(options.get_source()); + auto reader = std::make_unique( + std::move(datasources), options, rmm::cuda_stream_default, mr); return reader->read(options); } @@ -305,11 +300,13 @@ table_with_metadata read_orc(orc_reader_options const& options, rmm::mr::device_ */ void write_orc(orc_writer_options const& options, rmm::mr::device_memory_resource* mr) { + namespace io_detail = cudf::io::detail; + CUDF_FUNC_RANGE(); - namespace io_detail = cudf::io::detail; - auto writer = make_writer( - options.get_sink(), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr); + auto sink = make_datasink(options.get_sink()); + auto writer = std::make_unique( + std::move(sink), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr); writer->write(options.get_table()); } @@ -317,12 +314,15 @@ void write_orc(orc_writer_options const& options, rmm::mr::device_memory_resourc /** * @copydoc cudf::io::orc_chunked_writer::orc_chunked_writer */ -orc_chunked_writer::orc_chunked_writer(chunked_orc_writer_options const& op, +orc_chunked_writer::orc_chunked_writer(chunked_orc_writer_options const& options, rmm::mr::device_memory_resource* mr) { namespace io_detail = cudf::io::detail; - writer = make_writer( - op.get_sink(), op, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr); + + auto sink = make_datasink(options.get_sink()); + + writer = std::make_unique( + std::move(sink), options, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr); } /** @@ -354,8 +354,10 @@ table_with_metadata read_parquet(parquet_reader_options const& options, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - auto reader = make_reader( - options.get_source(), options, rmm::cuda_stream_default, mr); + + auto datasources = make_datasources(options.get_source()); + auto reader = std::make_unique( + 
std::move(datasources), options, rmm::cuda_stream_default, mr); return reader->read(options); } @@ -392,25 +394,31 @@ table_input_metadata::table_input_metadata(table_view const& table, std::unique_ptr> write_parquet(parquet_writer_options const& options, rmm::mr::device_memory_resource* mr) { - CUDF_FUNC_RANGE(); namespace io_detail = cudf::io::detail; - auto writer = make_writer( - options.get_sink(), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr); + CUDF_FUNC_RANGE(); + + auto sink = make_datasink(options.get_sink()); + auto writer = std::make_unique( + std::move(sink), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr); writer->write(options.get_table()); + return writer->close(options.get_column_chunks_file_path()); } /** * @copydoc cudf::io::parquet_chunked_writer::parquet_chunked_writer */ -parquet_chunked_writer::parquet_chunked_writer(chunked_parquet_writer_options const& op, +parquet_chunked_writer::parquet_chunked_writer(chunked_parquet_writer_options const& options, rmm::mr::device_memory_resource* mr) { namespace io_detail = cudf::io::detail; - writer = make_writer( - op.get_sink(), op, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr); + + auto sink = make_datasink(options.get_sink()); + + writer = std::make_unique( + std::move(sink), options, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr); } /** From 88e23990151c737dcb4a22a5d6454ef8893285c4 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Tue, 17 Aug 2021 00:53:48 -0500 Subject: [PATCH 02/32] remove filepath-related logic from csv and json readers --- cpp/include/cudf/io/csv.hpp | 2 +- cpp/include/cudf/io/json.hpp | 2 +- cpp/src/io/comp/io_uncomp.h | 7 +++-- cpp/src/io/comp/uncomp.cpp | 19 ++++++------ cpp/src/io/csv/reader_impl.cu | 38 ++++++------------------ cpp/src/io/csv/reader_impl.hpp | 4 --- cpp/src/io/functions.cpp | 40 ++++++++++++++++++++++++-- cpp/src/io/json/reader_impl.cu | 31 ++------------------ cpp/src/io/json/reader_impl.hpp | 1 - cpp/src/io/utilities/parsing_utils.cu | 34 ---------------------- cpp/src/io/utilities/parsing_utils.cuh | 18 ------------ python/cudf/cudf/_lib/csv.pyx | 2 +- python/cudf/cudf/tests/test_csv.py | 14 --------- 13 files changed, 66 insertions(+), 146 deletions(-) diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index d4a21b2e98c..c807f189aac 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -1199,7 +1199,7 @@ class csv_reader_options_builder { * @return The set of columns along with metadata. */ table_with_metadata read_csv( - csv_reader_options const& options, + csv_reader_options options, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 8954f7dcab1..bca60f76260 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -364,7 +364,7 @@ class json_reader_options_builder { * @return The set of columns along with metadata. 
*/ table_with_metadata read_json( - json_reader_options const& options, + json_reader_options options, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/io/comp/io_uncomp.h b/cpp/src/io/comp/io_uncomp.h index 8daf73ecd0c..7b1feb84813 100644 --- a/cpp/src/io/comp/io_uncomp.h +++ b/cpp/src/io/comp/io_uncomp.h @@ -16,12 +16,13 @@ #pragma once +#include +#include + #include #include #include -#include - using cudf::host_span; namespace cudf { @@ -42,7 +43,7 @@ enum { std::vector io_uncompress_single_h2d(void const* src, size_t src_size, int stream_type); -std::vector get_uncompressed_data(host_span data, std::string const& compression); +std::vector get_uncompressed_data(host_span data, compression_type compression); class HostDecompressor { public: diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index 2cb99d897fe..ee451d04dbb 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -369,6 +369,7 @@ std::vector io_uncompress_single_h2d(const void* src, size_t src_size, int // Unsupported format break; } + CUDF_EXPECTS(comp_data != nullptr, "Unsupported compressed stream type"); CUDF_EXPECTS(comp_len > 0, "Unsupported compressed stream type"); @@ -422,17 +423,17 @@ std::vector io_uncompress_single_h2d(const void* src, size_t src_size, int * @return Vector containing the output uncompressed data */ std::vector get_uncompressed_data(host_span const data, - std::string const& compression) + compression_type compression) { int comp_type = IO_UNCOMP_STREAM_TYPE_INFER; - if (compression == "gzip") - comp_type = IO_UNCOMP_STREAM_TYPE_GZIP; - else if (compression == "zip") - comp_type = IO_UNCOMP_STREAM_TYPE_ZIP; - else if (compression == "bz2") - comp_type = IO_UNCOMP_STREAM_TYPE_BZIP2; - else if (compression == "xz") - comp_type = IO_UNCOMP_STREAM_TYPE_XZ; + + switch (compression) { + case compression_type::GZIP: comp_type = IO_UNCOMP_STREAM_TYPE_GZIP; break; + case compression_type::ZIP: comp_type = IO_UNCOMP_STREAM_TYPE_ZIP; break; + case compression_type::BZIP2: comp_type = IO_UNCOMP_STREAM_TYPE_BZIP2; break; + case compression_type::XZ: comp_type = IO_UNCOMP_STREAM_TYPE_XZ; break; + default: break; + } return io_uncompress_single_h2d(data.data(), data.size(), comp_type); } diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 549b0474fe1..a85a610962e 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -206,10 +206,12 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) auto num_rows = opts_.get_nrows(); if (range_offset > 0 || range_size > 0) { - CUDF_EXPECTS(compression_type_ == "none", + CUDF_EXPECTS(opts_.get_compression() == compression_type::NONE, "Reading compressed data using `byte range` is unsupported"); } + size_t map_range_size = 0; + if (range_size != 0) { auto num_given_dtypes = std::visit([](const auto& dtypes) { return dtypes.size(); }, opts_.get_dtypes()); @@ -217,12 +219,7 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) map_range_size = range_size + calculateMaxRowSize(num_columns); } - // Support delayed opening of the file if using memory mapping datasource - // This allows only mapping of a subset of the file if using byte range - if (source_ == nullptr) { - assert(!filepath_.empty()); - source_ = datasource::create(filepath_, range_offset, map_range_size); - } + // TODO: provide hint to datasource that we should memory map any underlying file. 
// Transfer source data to GPU if (!source_->is_empty()) { @@ -235,10 +232,11 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) std::vector h_uncomp_data_owner; - if (compression_type_ != "none") { - h_uncomp_data_owner = get_uncompressed_data(h_data, compression_type_); + if (opts_.get_compression() != compression_type::NONE) { + h_uncomp_data_owner = get_uncompressed_data(h_data, opts_.get_compression()); h_data = h_uncomp_data_owner; } + // None of the parameters for row selection is used, we are parsing the entire file const bool load_whole_file = range_offset == 0 && range_size == 0 && skip_rows <= 0 && skip_end_rows <= 0 && num_rows == -1; @@ -927,35 +925,17 @@ parse_options make_parse_options(csv_reader_options const& reader_opts, } reader::impl::impl(std::unique_ptr source, - std::string filepath, csv_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : mr_(mr), source_(std::move(source)), filepath_(filepath), opts_(options) + : mr_(mr), source_(std::move(source)), opts_(options) { num_actual_cols_ = opts_.get_names().size(); num_active_cols_ = num_actual_cols_; - compression_type_ = - infer_compression_type(opts_.get_compression(), - filepath, - {{"gz", "gzip"}, {"zip", "zip"}, {"bz2", "bz2"}, {"xz", "xz"}}); - opts = make_parse_options(options, stream); } -// Forward to implementation -reader::reader(std::vector const& filepaths, - csv_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(filepaths.size() == 1, "Only a single source is currently supported."); - // Delay actual instantiation of data source until read to allow for - // partial memory mapping of file using byte ranges - _impl = std::make_unique(nullptr, filepaths[0], options, stream, mr); -} - // Forward to implementation reader::reader(std::vector>&& sources, csv_reader_options const& options, @@ -963,7 +943,7 @@ reader::reader(std::vector>&& sources, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(sources.size() == 1, "Only a single source is currently supported."); - _impl = std::make_unique(std::move(sources[0]), "", options, stream, mr); + _impl = std::make_unique(std::move(sources[0]), options, stream, mr); } // Destructor within this translation unit diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp index 36c2bf4f9e7..beaa9b816cb 100644 --- a/cpp/src/io/csv/reader_impl.hpp +++ b/cpp/src/io/csv/reader_impl.hpp @@ -72,13 +72,11 @@ class reader::impl { * @brief Constructor from a dataset source with reader options. 
* * @param source Dataset source - * @param filepath Filepath if reading dataset from a file * @param options Settings for controlling reading behavior * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ explicit impl(std::unique_ptr source, - std::string filepath, csv_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); @@ -222,8 +220,6 @@ class reader::impl { private: rmm::mr::device_memory_resource* mr_ = nullptr; std::unique_ptr source_; - std::string filepath_; - std::string compression_type_; const csv_reader_options opts_; cudf::size_type num_records_ = 0; // Number of rows with actual data diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index e080ea3a2ca..ccc2eef56c7 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -144,27 +144,61 @@ table_with_metadata read_avro(avro_reader_options const& options, return reader->read(options); } -table_with_metadata read_json(json_reader_options const& options, - rmm::mr::device_memory_resource* mr) +compression_type infer_compression_type(compression_type compression, source_info const& info) +{ + if (compression != compression_type::AUTO) { return compression; } + + if (info.type != io_type::FILEPATH) { return compression_type::NONE; } + + auto filepath = info.filepaths[0]; + + // Attempt to infer from the file extension + const auto pos = filepath.find_last_of('.'); + + if (pos == std::string::npos) { return {}; } + + auto str_tolower = [](const auto& begin, const auto& end) { + std::string out; + std::transform(begin, end, std::back_inserter(out), ::tolower); + return out; + }; + + const auto ext = str_tolower(filepath.begin() + pos + 1, filepath.end()); + + if (ext == "gz") { return compression_type::GZIP; } + if (ext == "zip") { return compression_type::ZIP; } + if (ext == "bz2") { return compression_type::BZIP2; } + if (ext == "xz") { return compression_type::XZ; } + + return compression_type::NONE; +} + +table_with_metadata read_json(json_reader_options options, rmm::mr::device_memory_resource* mr) { namespace json = cudf::io::detail::json; CUDF_FUNC_RANGE(); auto datasources = make_datasources(options.get_source()); + + options.set_compression(infer_compression_type(options.get_compression(), options.get_source())); + auto reader = std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); return reader->read(options); } -table_with_metadata read_csv(csv_reader_options const& options, rmm::mr::device_memory_resource* mr) +table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_resource* mr) { namespace csv = cudf::io::detail::csv; CUDF_FUNC_RANGE(); auto datasources = make_datasources(options.get_source()); + + options.set_compression(infer_compression_type(options.get_compression(), options.get_source())); + auto reader = std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index a8f117c22bf..bae7471e307 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -241,15 +241,6 @@ void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size) map_range_size = range_size + calculate_max_row_size(dtype_option_size); } - // Support delayed opening of the file if using memory mapping datasource - // This allows only mapping of a subset of the file if using byte range - 
if (sources_.empty()) { - assert(!filepaths_.empty()); - for (const auto& path : filepaths_) { - sources_.emplace_back(datasource::create(path, range_offset, map_range_size)); - } - } - // Iterate through the user defined sources and read the contents into the local buffer CUDF_EXPECTS(!sources_.empty(), "No sources were defined"); size_t total_source_size = 0; @@ -280,11 +271,7 @@ void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size) */ void reader::impl::decompress_input(rmm::cuda_stream_view stream) { - const auto compression_type = - infer_compression_type(options_.get_compression(), - filepaths_.size() > 0 ? filepaths_[0] : "", - {{"gz", "gzip"}, {"zip", "zip"}, {"bz2", "bz2"}, {"xz", "xz"}}); - if (compression_type == "none") { + if (options_.get_compression() == compression_type::NONE) { // Do not use the owner vector here to avoid extra copy uncomp_data_ = reinterpret_cast(buffer_.data()); uncomp_size_ = buffer_.size(); @@ -293,7 +280,7 @@ void reader::impl::decompress_input(rmm::cuda_stream_view stream) host_span( // reinterpret_cast(buffer_.data()), buffer_.size()), - compression_type); + options_.get_compression()); uncomp_data_ = uncomp_data_owner_.data(); uncomp_size_ = uncomp_data_owner_.size(); @@ -665,7 +652,7 @@ reader::impl::impl(std::vector>&& sources, json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : options_(options), mr_(mr), sources_(std::move(sources)), filepaths_(filepaths) + : options_(options), mr_(mr), sources_(std::move(sources)) { CUDF_EXPECTS(options_.is_enabled_lines(), "Only JSON Lines format is currently supported.\n"); @@ -713,18 +700,6 @@ table_with_metadata reader::impl::read(json_reader_options const& options, return convert_data_to_table(rec_starts, stream); } -// Forward to implementation -reader::reader(std::vector const& filepaths, - json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // Delay actual instantiation of data source until read to allow for - // partial memory mapping of file using byte ranges - std::vector> src = {}; // Empty datasources - _impl = std::make_unique(std::move(src), filepaths, options, stream, mr); -} - // Forward to implementation reader::reader(std::vector>&& sources, json_reader_options const& options, diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 5cf51369cdf..f7af55b2b90 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -57,7 +57,6 @@ class reader::impl { rmm::mr::device_memory_resource* mr_ = nullptr; std::vector> sources_; - std::vector filepaths_; std::vector buffer_; const char* uncomp_data_ = nullptr; diff --git a/cpp/src/io/utilities/parsing_utils.cu b/cpp/src/io/utilities/parsing_utils.cu index 6c8f01111e5..ba62238c5d3 100644 --- a/cpp/src/io/utilities/parsing_utils.cu +++ b/cpp/src/io/utilities/parsing_utils.cu @@ -209,39 +209,5 @@ cudf::size_type count_all_from_set(const char* h_data, return find_all_from_set(h_data, h_size, keys, 0, nullptr, stream); } -std::string infer_compression_type( - const compression_type& compression_arg, - const std::string& filename, - const std::vector>& ext_to_comp_map) -{ - auto str_tolower = [](const auto& begin, const auto& end) { - std::string out; - std::transform(begin, end, std::back_inserter(out), ::tolower); - return out; - }; - - // Attempt to infer from user-supplied argument - if (compression_arg != compression_type::AUTO) { - switch (compression_arg) { - 
case compression_type::GZIP: return "gzip"; - case compression_type::BZIP2: return "bz2"; - case compression_type::ZIP: return "zip"; - case compression_type::XZ: return "xz"; - default: break; - } - } - - // Attempt to infer from the file extension - const auto pos = filename.find_last_of('.'); - if (pos != std::string::npos) { - const auto ext = str_tolower(filename.begin() + pos + 1, filename.end()); - for (const auto& mapping : ext_to_comp_map) { - if (mapping.first == ext) { return mapping.second; } - } - } - - return "none"; -} - } // namespace io } // namespace cudf diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 88297423b9b..daf23de7eb2 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -454,24 +454,6 @@ cudf::size_type count_all_from_set(const char* h_data, const std::vector& keys, rmm::cuda_stream_view stream); -/** - * @brief Infer file compression type based on user supplied arguments. - * - * If the user specifies a valid compression_type for compression arg, - * compression type will be computed based on that. Otherwise the filename - * and ext_to_comp_map will be used. - * - * @param[in] compression_arg User specified compression type (if any) - * @param[in] filename Filename to base compression type (by extension) on - * @param[in] ext_to_comp_map User supplied mapping of file extension to compression type - * - * @return string representing compression type ("gzip, "bz2", etc) - */ -std::string infer_compression_type( - const compression_type& compression_arg, - const std::string& filename, - const std::vector>& ext_to_comp_map); - /** * @brief Checks whether the given character is a whitespace character. * diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index a15a180d466..7a54ccac197 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -101,7 +101,7 @@ cdef csv_reader_options make_csv_reader_options( bool na_filter, object prefix, object index_col, -) except +: +) except *: cdef source_info c_source_info = make_source_info([datasource]) cdef compression_type c_compression cdef size_type c_header diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 5511a65d0a4..8fb5d7cc9eb 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1069,20 +1069,6 @@ def test_csv_reader_byte_range(tmpdir, segment_bytes): assert list(df["int2"]) == list(ref_df["int2"]) -def test_csv_reader_byte_range_type_corner_case(tmpdir): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file17.csv") - - cudf.datasets.timeseries( - start="2000-01-01", - end="2000-01-02", - dtypes={"name": str, "id": int, "x": float, "y": float}, - ).to_csv(fname, chunksize=100000) - - byte_range = (2_147_483_648, 0) - with pytest.raises(RuntimeError, match="Offset is past end of file"): - cudf.read_csv(fname, byte_range=byte_range, header=None) - - @pytest.mark.parametrize("segment_bytes", [10, 19, 31, 36]) def test_csv_reader_byte_range_strings(segment_bytes): names = ["strings"] From 62b95202d9b1db14f765ef45644d9cf91f782ea7 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Tue, 17 Aug 2021 02:19:27 -0500 Subject: [PATCH 03/32] remove filepath logic from avro, parquet, orc readers --- cpp/include/cudf/io/detail/avro.hpp | 13 ------------- cpp/include/cudf/io/detail/orc.hpp | 13 ------------- cpp/include/cudf/io/detail/parquet.hpp | 13 ------------- cpp/src/io/avro/reader_impl.cu | 10 
---------- cpp/src/io/orc/reader_impl.cu | 9 --------- cpp/src/io/parquet/reader_impl.cu | 9 --------- 6 files changed, 67 deletions(-) diff --git a/cpp/include/cudf/io/detail/avro.hpp b/cpp/include/cudf/io/detail/avro.hpp index 98483d1c03e..306c15dcb72 100644 --- a/cpp/include/cudf/io/detail/avro.hpp +++ b/cpp/include/cudf/io/detail/avro.hpp @@ -38,19 +38,6 @@ class reader { std::unique_ptr _impl; public: - /** - * @brief Constructor from an array of file paths - * - * @param filepaths Paths to the files containing the input dataset - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - explicit reader(std::vector const& filepaths, - avro_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - /** * @brief Constructor from an array of datasources * diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp index ab26c01db74..2174b688da2 100644 --- a/cpp/include/cudf/io/detail/orc.hpp +++ b/cpp/include/cudf/io/detail/orc.hpp @@ -47,19 +47,6 @@ class reader { std::unique_ptr _impl; public: - /** - * @brief Constructor from an array of file paths - * - * @param filepaths Paths to the files containing the input dataset - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - explicit reader(std::vector const& filepaths, - orc_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - /** * @brief Constructor from an array of datasources * diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index d95af7a11da..14f27ef8eef 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -49,19 +49,6 @@ class reader { std::unique_ptr _impl; public: - /** - * @brief Constructor from an array of file paths - * - * @param filepaths Paths to the files containing the input dataset - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - explicit reader(std::vector const& filepaths, - parquet_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - /** * @brief Constructor from an array of datasources * diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index f6ffdd99d35..08ea96139a1 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -474,16 +474,6 @@ table_with_metadata reader::impl::read(avro_reader_options const& options, return {std::make_unique(std::move(out_columns)), std::move(metadata_out)}; } -// Forward to implementation -reader::reader(std::vector const& filepaths, - avro_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(filepaths.size() == 1, "Only a single source is currently supported."); - _impl = std::make_unique(datasource::create(filepaths[0]), options, mr); -} - // Forward to implementation reader::reader(std::vector>&& sources, avro_reader_options const& options, diff --git a/cpp/src/io/orc/reader_impl.cu 
b/cpp/src/io/orc/reader_impl.cu
index 033a2d9aff5..5d62c45df83 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -1383,15 +1383,6 @@ table_with_metadata reader::impl::read(size_type skip_rows,
   return {std::make_unique<table>
(std::move(out_columns)), std::move(out_metadata)};
 }
 
-// Forward to implementation
-reader::reader(std::vector<std::string> const& filepaths,
-               orc_reader_options const& options,
-               rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource* mr)
-{
-  _impl = std::make_unique<impl>(datasource::create(filepaths), options, mr);
-}
-
 // Forward to implementation
 reader::reader(std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
                orc_reader_options const& options,
diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu
index 9f9bdfd4755..31ae763d9ff 100644
--- a/cpp/src/io/parquet/reader_impl.cu
+++ b/cpp/src/io/parquet/reader_impl.cu
@@ -1608,15 +1608,6 @@ table_with_metadata reader::impl::read(size_type skip_rows,
   return {std::make_unique<table>
(std::move(out_columns)), std::move(out_metadata)}; } -// Forward to implementation -reader::reader(std::vector const& filepaths, - parquet_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : _impl(std::make_unique(datasource::create(filepaths), options, mr)) -{ -} - // Forward to implementation reader::reader(std::vector>&& sources, parquet_reader_options const& options, From fb0129433bdd2dd264105ba172d96f2a310d8d8d Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 18 Aug 2021 15:19:11 -0500 Subject: [PATCH 04/32] move range size padding calculation out of json/csv reader and in to json/csv options --- cpp/include/cudf/io/csv.hpp | 34 +++++++++++++++++++++ cpp/include/cudf/io/json.hpp | 32 +++++++++++++++++++ cpp/src/io/csv/reader_impl.cu | 49 +++++------------------------- cpp/src/io/functions.cpp | 24 +++++++++++---- cpp/src/io/json/reader_impl.cu | 46 ++++++---------------------- cpp/src/io/json/reader_impl.hpp | 3 +- python/cudf/cudf/tests/test_csv.py | 14 +++++++++ 7 files changed, 116 insertions(+), 86 deletions(-) diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index c807f189aac..1aa6e3bea29 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -177,6 +177,40 @@ class csv_reader_options { */ std::size_t get_byte_range_size() const { return _byte_range_size; } + /** + * @brief Returns number of bytes to read with padding. + */ + std::size_t get_byte_range_size_with_padding() const + { + if (_byte_range_size == 0) { + return 0; + } else { + return _byte_range_size + get_byte_range_padding(); + } + } + + /** + * @brief Returns number of bytes to pad when reading. + */ + std::size_t get_byte_range_padding() const + { + auto const num_names = _names.size(); + auto const num_dtypes = std::visit([](const auto& dtypes) { return dtypes.size(); }, _dtypes); + auto const num_columns = std::max(num_dtypes, num_names); + + auto const max_row_bytes = 16 * 1024; // 16KB + auto const column_bytes = 64; + auto const base_padding = 1024; // 1KB + + if (num_columns == 0) { + // Use flat size if the number of columns is not known + return max_row_bytes; + } + + // Expand the size based on the number of columns, if available + return base_padding + num_columns * column_bytes; + } + /** * @brief Returns names of the columns. */ diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index bca60f76260..5d2a4f6fcd1 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -140,6 +140,38 @@ class json_reader_options { */ size_t get_byte_range_size() const { return _byte_range_size; } + /** + * @brief Returns number of bytes to read with padding. + */ + size_t get_byte_range_size_with_padding() const + { + if (_byte_range_size == 0) { + return 0; + } else { + return _byte_range_size + get_byte_range_padding(); + } + } + + /** + * @brief Returns number of bytes to pad when reading. + */ + size_t get_byte_range_padding() const + { + auto const num_columns = std::visit([](const auto& dtypes) { return dtypes.size(); }, _dtypes); + + auto const max_row_bytes = 16 * 1024; // 16KB + auto const column_bytes = 64; + auto const base_padding = 1024; // 1KB + + if (num_columns == 0) { + // Use flat size if the number of columns is not known + return max_row_bytes; + } + + // Expand the size based on the number of columns, if available + return base_padding + num_columns * column_bytes; + } + /** * @brief Whether to read the file as a json object per line. 
*/ diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index a85a610962e..c61cc26800e 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -57,31 +57,6 @@ namespace csv { using namespace cudf::io::csv; using namespace cudf::io; -/** - * @brief Estimates the maximum expected length or a row, based on the number - * of columns - * - * If the number of columns is not available, it will return a value large - * enough for most use cases - * - * @param[in] num_columns Number of columns in the CSV file (optional) - * - * @return Estimated maximum size of a row, in bytes - */ -constexpr size_t calculateMaxRowSize(int num_columns = 0) noexcept -{ - constexpr size_t max_row_bytes = 16 * 1024; // 16KB - constexpr size_t column_bytes = 64; - constexpr size_t base_padding = 1024; // 1KB - if (num_columns == 0) { - // Use flat size if the number of columns is not known - return max_row_bytes; - } else { - // Expand the size based on the number of columns, if available - return base_padding + num_columns * column_bytes; - } -} - /** * @brief Translates a dtype string and returns its dtype enumeration and any * extended dtype flags that are supported by cuIO. Often, this is a column @@ -199,31 +174,21 @@ void erase_except_last(C& container, rmm::cuda_stream_view stream) std::pair, reader::impl::selected_rows_offsets> reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) { - auto range_offset = opts_.get_byte_range_offset(); - auto range_size = opts_.get_byte_range_size(); - auto skip_rows = opts_.get_skiprows(); - auto skip_end_rows = opts_.get_skipfooter(); - auto num_rows = opts_.get_nrows(); + auto range_offset = opts_.get_byte_range_offset(); + auto range_size = opts_.get_byte_range_size(); + auto range_size_padded = opts_.get_byte_range_size_with_padding(); + auto skip_rows = opts_.get_skiprows(); + auto skip_end_rows = opts_.get_skipfooter(); + auto num_rows = opts_.get_nrows(); if (range_offset > 0 || range_size > 0) { CUDF_EXPECTS(opts_.get_compression() == compression_type::NONE, "Reading compressed data using `byte range` is unsupported"); } - size_t map_range_size = 0; - - if (range_size != 0) { - auto num_given_dtypes = - std::visit([](const auto& dtypes) { return dtypes.size(); }, opts_.get_dtypes()); - const auto num_columns = std::max(opts_.get_names().size(), num_given_dtypes); - map_range_size = range_size + calculateMaxRowSize(num_columns); - } - - // TODO: provide hint to datasource that we should memory map any underlying file. - // Transfer source data to GPU if (!source_->is_empty()) { - auto data_size = (map_range_size != 0) ? map_range_size : source_->size(); + auto data_size = (range_size_padded != 0) ? 
range_size_padded : source_->size(); auto buffer = source_->host_read(range_offset, data_size); auto h_data = host_span( // diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index ccc2eef56c7..438cb1762c6 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -107,10 +107,18 @@ chunked_parquet_writer_options_builder chunked_parquet_writer_options::builder( namespace { -std::vector> make_datasources(source_info const& info) +std::vector> make_datasources(source_info const& info, + size_t range_offset = 0, + size_t range_size = 0) { switch (info.type) { - case io_type::FILEPATH: return cudf::io::datasource::create(info.filepaths); + case io_type::FILEPATH: { + auto sources = std::vector>(); + for (auto const& filepath : info.filepaths) { + sources.emplace_back(cudf::io::datasource::create(filepath, range_offset, range_size)); + } + return sources; + } case io_type::HOST_BUFFER: return cudf::io::datasource::create(info.buffers); case io_type::USER_IMPLEMENTED: return cudf::io::datasource::create(info.user_sources); default: CUDF_FAIL("Unsupported source type"); @@ -179,10 +187,12 @@ table_with_metadata read_json(json_reader_options options, rmm::mr::device_memor CUDF_FUNC_RANGE(); - auto datasources = make_datasources(options.get_source()); - options.set_compression(infer_compression_type(options.get_compression(), options.get_source())); + auto datasources = make_datasources(options.get_source(), + options.get_byte_range_offset(), + options.get_byte_range_size_with_padding()); + auto reader = std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); @@ -195,10 +205,12 @@ table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_ CUDF_FUNC_RANGE(); - auto datasources = make_datasources(options.get_source()); - options.set_compression(infer_compression_type(options.get_compression(), options.get_source())); + auto datasources = make_datasources(options.get_source(), + options.get_byte_range_offset(), + options.get_byte_range_size_with_padding()); + auto reader = std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index bae7471e307..0618f02e98f 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -50,31 +50,6 @@ namespace detail { namespace json { using namespace cudf::io; -namespace { -/** - * @brief Estimates the maximum expected length or a row, based on the number - * of columns - * - * If the number of columns is not available, it will return a value large - * enough for most use cases - * - * @param[in] num_columns Number of columns in the JSON file (optional) - * - * @return Estimated maximum size of a row, in bytes - */ -constexpr size_t calculate_max_row_size(int num_columns = 0) noexcept -{ - constexpr size_t max_row_bytes = 16 * 1024; // 16KB - constexpr size_t column_bytes = 64; - constexpr size_t base_padding = 1024; // 1KB - return num_columns == 0 - ? max_row_bytes // Use flat size if the # of columns is not known - : base_padding + - num_columns * column_bytes; // Expand size based on the # of columns, if available -} - -} // anonymous namespace - /** * @brief Aggregate the table containing keys info by their hash values. 
* @@ -231,16 +206,12 @@ std::pair, col_map_ptr_type> reader::impl::get_json_obj * * @param[in] range_offset Number of bytes offset from the start * @param[in] range_size Bytes to read; use `0` for all remaining data + * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data */ -void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size) +void reader::impl::ingest_raw_input(size_t range_offset, + size_t range_size, + size_t range_size_padded) { - size_t map_range_size = 0; - if (range_size != 0) { - auto const dtype_option_size = - std::visit([](const auto& dtypes) { return dtypes.size(); }, options_.get_dtypes()); - map_range_size = range_size + calculate_max_row_size(dtype_option_size); - } - // Iterate through the user defined sources and read the contents into the local buffer CUDF_EXPECTS(!sources_.empty(), "No sources were defined"); size_t total_source_size = 0; @@ -253,7 +224,7 @@ void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size) size_t bytes_read = 0; for (const auto& source : sources_) { if (!source->is_empty()) { - auto data_size = (map_range_size != 0) ? map_range_size : source->size(); + auto data_size = (range_size_padded != 0) ? range_size_padded : source->size(); bytes_read += source->host_read(range_offset, data_size, &buffer_[bytes_read]); } } @@ -675,10 +646,11 @@ reader::impl::impl(std::vector>&& sources, table_with_metadata reader::impl::read(json_reader_options const& options, rmm::cuda_stream_view stream) { - auto range_offset = options.get_byte_range_offset(); - auto range_size = options.get_byte_range_size(); + auto range_offset = options.get_byte_range_offset(); + auto range_size = options.get_byte_range_size(); + auto range_size_padded = options.get_byte_range_size_with_padding(); - ingest_raw_input(range_offset, range_size); + ingest_raw_input(range_offset, range_size, range_size_padded); CUDF_EXPECTS(buffer_.size() != 0, "Ingest failed: input data is null.\n"); decompress_input(stream); diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index f7af55b2b90..d01f2e8677e 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -109,8 +109,9 @@ class reader::impl { * * @param[in] range_offset Number of bytes offset from the start * @param[in] range_size Bytes to read; use `0` for all remaining data + * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data */ - void ingest_raw_input(size_t range_offset, size_t range_size); + void ingest_raw_input(size_t range_offset, size_t range_size, size_t range_size_padded); /** * @brief Extract the JSON objects keys from the input file with object rows. 
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 8fb5d7cc9eb..5511a65d0a4 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1069,6 +1069,20 @@ def test_csv_reader_byte_range(tmpdir, segment_bytes): assert list(df["int2"]) == list(ref_df["int2"]) +def test_csv_reader_byte_range_type_corner_case(tmpdir): + fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file17.csv") + + cudf.datasets.timeseries( + start="2000-01-01", + end="2000-01-02", + dtypes={"name": str, "id": int, "x": float, "y": float}, + ).to_csv(fname, chunksize=100000) + + byte_range = (2_147_483_648, 0) + with pytest.raises(RuntimeError, match="Offset is past end of file"): + cudf.read_csv(fname, byte_range=byte_range, header=None) + + @pytest.mark.parametrize("segment_bytes", [10, 19, 31, 36]) def test_csv_reader_byte_range_strings(segment_bytes): names = ["strings"] From d422aebbe62d7e9915af93f474563e6e1c571e97 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 18 Aug 2021 15:30:38 -0500 Subject: [PATCH 05/32] remove filepaths from json reader --- cpp/src/io/json/reader_impl.cu | 12 +++++------- cpp/src/io/json/reader_impl.hpp | 3 +-- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 0618f02e98f..2964a12568f 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -231,7 +231,7 @@ void reader::impl::ingest_raw_input(size_t range_offset, byte_range_offset_ = range_offset; byte_range_size_ = range_size; - load_whole_file_ = byte_range_offset_ == 0 && byte_range_size_ == 0; + load_whole_source_ = byte_range_offset_ == 0 && byte_range_size_ == 0; } /** @@ -256,7 +256,7 @@ void reader::impl::decompress_input(rmm::cuda_stream_view stream) uncomp_data_ = uncomp_data_owner_.data(); uncomp_size_ = uncomp_data_owner_.size(); } - if (load_whole_file_) data_ = rmm::device_buffer(uncomp_data_, uncomp_size_, stream); + if (load_whole_source_) data_ = rmm::device_buffer(uncomp_data_, uncomp_size_, stream); } rmm::device_uvector reader::impl::find_record_starts(rmm::cuda_stream_view stream) @@ -268,7 +268,7 @@ rmm::device_uvector reader::impl::find_record_starts(rmm::cuda_stream_ if (allow_newlines_in_strings_) { chars_to_count.push_back('\"'); } // If not starting at an offset, add an extra row to account for the first row in the file cudf::size_type prefilter_count = ((byte_range_offset_ == 0) ? 
1 : 0); - if (load_whole_file_) { + if (load_whole_source_) { prefilter_count += count_all_from_set(data_, chars_to_count, stream); } else { prefilter_count += count_all_from_set(uncomp_data_, uncomp_size_, chars_to_count, stream); @@ -286,7 +286,7 @@ rmm::device_uvector reader::impl::find_record_starts(rmm::cuda_stream_ std::vector chars_to_find{'\n'}; if (allow_newlines_in_strings_) { chars_to_find.push_back('\"'); } // Passing offset = 1 to return positions AFTER the found character - if (load_whole_file_) { + if (load_whole_source_) { find_all_from_set(data_, chars_to_find, 1, find_result_ptr, stream); } else { find_all_from_set(uncomp_data_, uncomp_size_, chars_to_find, 1, find_result_ptr, stream); @@ -619,7 +619,6 @@ table_with_metadata reader::impl::convert_data_to_table(device_span>&& sources, - std::vector const& filepaths, json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -678,8 +677,7 @@ reader::reader(std::vector>&& sources, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - std::vector file_paths = {}; // Empty filepaths - _impl = std::make_unique(std::move(sources), file_paths, options, stream, mr); + _impl = std::make_unique(std::move(sources), options, stream, mr); } // Destructor within this translation unit diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index d01f2e8677e..d910cce2d72 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -68,7 +68,7 @@ class reader::impl { size_t byte_range_offset_ = 0; size_t byte_range_size_ = 0; - bool load_whole_file_ = true; + bool load_whole_source_ = true; table_metadata metadata_; std::vector dtypes_; @@ -186,7 +186,6 @@ class reader::impl { * @brief Constructor from a dataset source with reader options. */ explicit impl(std::vector>&& sources, - std::vector const& filepaths, json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); From e0cac1d39aa5143900ed0fbeb71ea4440a059252 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 19 Aug 2021 17:37:50 -0500 Subject: [PATCH 06/32] replace json reader impl buffer member with local variable --- cpp/src/io/json/reader_impl.cu | 37 +++++++++++++++------------------ cpp/src/io/json/reader_impl.hpp | 9 +++++--- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 2964a12568f..cfda7bb11dc 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -199,16 +199,8 @@ std::pair, col_map_ptr_type> reader::impl::get_json_obj create_col_names_hash_map(sorted_info->get_column(2).view(), stream)}; } -/** - * @brief Ingest input JSON file/buffer, without decompression. 
- * - * Sets the sources_, byte_range_offset_, and byte_range_size_ data members - * - * @param[in] range_offset Number of bytes offset from the start - * @param[in] range_size Bytes to read; use `0` for all remaining data - * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data - */ -void reader::impl::ingest_raw_input(size_t range_offset, +void reader::impl::ingest_raw_input(std::vector& buffer, + size_t range_offset, size_t range_size, size_t range_size_padded) { @@ -220,12 +212,12 @@ void reader::impl::ingest_raw_input(size_t range_offset, } total_source_size = total_source_size - range_offset; - buffer_.resize(total_source_size); + buffer.resize(total_source_size); size_t bytes_read = 0; for (const auto& source : sources_) { if (!source->is_empty()) { auto data_size = (range_size_padded != 0) ? range_size_padded : source->size(); - bytes_read += source->host_read(range_offset, data_size, &buffer_[bytes_read]); + bytes_read += source->host_read(range_offset, data_size, &buffer[bytes_read]); } } @@ -240,17 +232,18 @@ void reader::impl::ingest_raw_input(size_t range_offset, * Sets the uncomp_data_ and uncomp_size_ data members * Loads the data into device memory if byte range parameters are not used */ -void reader::impl::decompress_input(rmm::cuda_stream_view stream) +void reader::impl::decompress_input(std::vector const& buffer, + rmm::cuda_stream_view stream) { if (options_.get_compression() == compression_type::NONE) { // Do not use the owner vector here to avoid extra copy - uncomp_data_ = reinterpret_cast(buffer_.data()); - uncomp_size_ = buffer_.size(); + uncomp_data_ = reinterpret_cast(buffer.data()); + uncomp_size_ = buffer.size(); } else { uncomp_data_owner_ = get_uncompressed_data( // host_span( // - reinterpret_cast(buffer_.data()), - buffer_.size()), + reinterpret_cast(buffer.data()), + buffer.size()), options_.get_compression()); uncomp_data_ = uncomp_data_owner_.data(); @@ -649,10 +642,14 @@ table_with_metadata reader::impl::read(json_reader_options const& options, auto range_size = options.get_byte_range_size(); auto range_size_padded = options.get_byte_range_size_with_padding(); - ingest_raw_input(range_offset, range_size, range_size_padded); - CUDF_EXPECTS(buffer_.size() != 0, "Ingest failed: input data is null.\n"); + std::vector buffer; + + ingest_raw_input(buffer, range_offset, range_size, range_size_padded); + + CUDF_EXPECTS(buffer.size() != 0, "Ingest failed: input data is null.\n"); + + decompress_input(buffer, stream); - decompress_input(stream); CUDF_EXPECTS(uncomp_data_ != nullptr, "Ingest failed: uncompressed input data is null.\n"); CUDF_EXPECTS(uncomp_size_ != 0, "Ingest failed: uncompressed input data has zero size.\n"); diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index d910cce2d72..5e07c38a4c7 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -57,7 +57,6 @@ class reader::impl { rmm::mr::device_memory_resource* mr_ = nullptr; std::vector> sources_; - std::vector buffer_; const char* uncomp_data_ = nullptr; size_t uncomp_size_ = 0; @@ -107,11 +106,15 @@ class reader::impl { * * Sets the source_, byte_range_offset_, and byte_range_size_ data members * + * @param[in] buffer Buffer to read the bytes in to * @param[in] range_offset Number of bytes offset from the start * @param[in] range_size Bytes to read; use `0` for all remaining data * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data */ - void 
ingest_raw_input(size_t range_offset, size_t range_size, size_t range_size_padded);
+  void ingest_raw_input(std::vector<uint8_t>& buffer,
+                        size_t range_offset,
+                        size_t range_size,
+                        size_t range_size_padded);
 
   /**
    * @brief Extract the JSON objects keys from the input file with object rows.
@@ -126,7 +129,7 @@ class reader::impl {
    *
    * Sets the uncomp_data_ and uncomp_size_ data members
    */
-  void decompress_input(rmm::cuda_stream_view stream);
+  void decompress_input(std::vector<uint8_t> const& buffer, rmm::cuda_stream_view stream);
 
   /**
    * @brief Finds all record starts in the file.
From dc236348b141ad8878ab07c49f9760037101a1c8 Mon Sep 17 00:00:00 2001
From: Christopher Harris
Date: Thu, 19 Aug 2021 21:25:09 -0500
Subject: [PATCH 07/32] replace json reader sources member with local variable

---
 cpp/include/cudf/io/detail/json.hpp | 25 +++++++----------------
 cpp/src/io/functions.cpp            |  5 ++---
 cpp/src/io/json/reader_impl.cu      | 32 +++++++++++++++--------------
 cpp/src/io/json/reader_impl.hpp     | 13 ++++++------
 4 files changed, 33 insertions(+), 42 deletions(-)

diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp
index e6d8f2de483..f39f42626bc 100644
--- a/cpp/include/cudf/io/detail/json.hpp
+++ b/cpp/include/cudf/io/detail/json.hpp
@@ -46,29 +46,14 @@ class reader {
   std::unique_ptr<impl> _impl;
 
  public:
-  /**
-   * @brief Constructor from an array of file paths
-   *
-   * @param filepaths Paths to the files containing the input dataset
-   * @param options Settings for controlling reading behavior
-   * @param stream CUDA stream used for device memory operations and kernel launches
-   * @param mr Device memory resource to use for device memory allocation
-   */
-  explicit reader(std::vector<std::string> const& filepaths,
-                  json_reader_options const& options,
-                  rmm::cuda_stream_view stream,
-                  rmm::mr::device_memory_resource* mr);
-
   /**
    * @brief Constructor from an array of datasources
    *
-   * @param sources Input `datasource` objects to read the dataset from
    * @param options Settings for controlling reading behavior
    * @param stream CUDA stream used for device memory operations and kernel launches
    * @param mr Device memory resource to use for device memory allocation
    */
-  explicit reader(std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
-                  json_reader_options const& options,
+  explicit reader(json_reader_options const& options,
                   rmm::cuda_stream_view stream,
                   rmm::mr::device_memory_resource* mr);
 
@@ -77,13 +62,17 @@ class reader {
    */
   ~reader();
 
-  /*
+  /**
    * @brief Reads and returns the entire data set.
    *
+   * @param[in] sources Input `datasource` objects to read the dataset from
    * @param[in] options Settings for controlling reading behavior
+   * @param[in] stream CUDA stream used for device memory operations and kernel launches
+   *
    * @return cudf::table object that contains the array of cudf::column.
*/ - table_with_metadata read(json_reader_options const& options, + table_with_metadata read(std::vector>& sources, + json_reader_options const& options, rmm::cuda_stream_view stream = rmm::cuda_stream_default); }; diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 438cb1762c6..b4a0ae2761f 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -193,10 +193,9 @@ table_with_metadata read_json(json_reader_options options, rmm::mr::device_memor options.get_byte_range_offset(), options.get_byte_range_size_with_padding()); - auto reader = - std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); + auto reader = std::make_unique(options, rmm::cuda_stream_default, mr); - return reader->read(options); + return reader->read(datasources, options); } table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_resource* mr) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index cfda7bb11dc..93c68752d2c 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -199,22 +199,22 @@ std::pair, col_map_ptr_type> reader::impl::get_json_obj create_col_names_hash_map(sorted_info->get_column(2).view(), stream)}; } -void reader::impl::ingest_raw_input(std::vector& buffer, +void reader::impl::ingest_raw_input(std::vector> const& sources, + std::vector& buffer, size_t range_offset, size_t range_size, size_t range_size_padded) { // Iterate through the user defined sources and read the contents into the local buffer - CUDF_EXPECTS(!sources_.empty(), "No sources were defined"); size_t total_source_size = 0; - for (const auto& source : sources_) { + for (const auto& source : sources) { total_source_size += source->size(); } - total_source_size = total_source_size - range_offset; + total_source_size = total_source_size - (range_offset * sources.size()); buffer.resize(total_source_size); size_t bytes_read = 0; - for (const auto& source : sources_) { + for (const auto& source : sources) { if (!source->is_empty()) { auto data_size = (range_size_padded != 0) ? 
range_size_padded : source->size(); bytes_read += source->host_read(range_offset, data_size, &buffer[bytes_read]); @@ -611,11 +611,10 @@ table_with_metadata reader::impl::convert_data_to_table(device_span(std::move(out_columns)), metadata_}; } -reader::impl::impl(std::vector>&& sources, - json_reader_options const& options, +reader::impl::impl(json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : options_(options), mr_(mr), sources_(std::move(sources)) + : options_(options), mr_(mr) { CUDF_EXPECTS(options_.is_enabled_lines(), "Only JSON Lines format is currently supported.\n"); @@ -635,7 +634,8 @@ reader::impl::impl(std::vector>&& sources, * * @return Table and its metadata */ -table_with_metadata reader::impl::read(json_reader_options const& options, +table_with_metadata reader::impl::read(std::vector>& sources, + json_reader_options const& options, rmm::cuda_stream_view stream) { auto range_offset = options.get_byte_range_offset(); @@ -644,7 +644,7 @@ table_with_metadata reader::impl::read(json_reader_options const& options, std::vector buffer; - ingest_raw_input(buffer, range_offset, range_size, range_size_padded); + ingest_raw_input(sources, buffer, range_offset, range_size, range_size_padded); CUDF_EXPECTS(buffer.size() != 0, "Ingest failed: input data is null.\n"); @@ -669,21 +669,23 @@ table_with_metadata reader::impl::read(json_reader_options const& options, } // Forward to implementation -reader::reader(std::vector>&& sources, - json_reader_options const& options, +reader::reader(json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - _impl = std::make_unique(std::move(sources), options, stream, mr); + _impl = std::make_unique(options, stream, mr); } // Destructor within this translation unit reader::~reader() = default; // Forward to implementation -table_with_metadata reader::read(json_reader_options const& options, rmm::cuda_stream_view stream) +table_with_metadata reader::read(std::vector>& sources, + json_reader_options const& options, + rmm::cuda_stream_view stream) { - return table_with_metadata{_impl->read(options, stream)}; + CUDF_EXPECTS(not sources.empty(), "No sources were defined"); + return table_with_metadata{_impl->read(sources, options, stream)}; } } // namespace json } // namespace detail diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 5e07c38a4c7..25ff47a8d6a 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -56,8 +56,6 @@ class reader::impl { rmm::mr::device_memory_resource* mr_ = nullptr; - std::vector> sources_; - const char* uncomp_data_ = nullptr; size_t uncomp_size_ = 0; @@ -111,7 +109,8 @@ class reader::impl { * @param[in] range_size Bytes to read; use `0` for all remaining data * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data */ - void ingest_raw_input(std::vector& buffer, + void ingest_raw_input(std::vector> const& sources, + std::vector& buffer, size_t range_offset, size_t range_size, size_t range_size_padded); @@ -188,20 +187,22 @@ class reader::impl { /** * @brief Constructor from a dataset source with reader options. 
*/ - explicit impl(std::vector>&& sources, - json_reader_options const& options, + explicit impl(json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** * @brief Read an entire set or a subset of data from the source * + * @param[in] sources Input `datasource` objects to read the dataset from * @param[in] options Settings for controlling reading behavior * @param[in] stream CUDA stream used for device memory operations and kernel launches. * * @return Table and its metadata */ - table_with_metadata read(json_reader_options const& options, rmm::cuda_stream_view stream); + table_with_metadata read(std::vector>& sources, + json_reader_options const& options, + rmm::cuda_stream_view stream); }; } // namespace json From 166c4d3c31edc386d737caa792c5d53523010d87 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 19 Aug 2021 21:45:58 -0500 Subject: [PATCH 08/32] delete useless json reader wrapper class --- cpp/include/cudf/io/detail/json.hpp | 49 ++++++-------------- cpp/src/io/functions.cpp | 6 +-- cpp/src/io/json/reader_impl.cu | 69 +++++++++++++---------------- cpp/src/io/json/reader_impl.hpp | 8 ++-- 4 files changed, 48 insertions(+), 84 deletions(-) diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index f39f42626bc..3a443b9b3d0 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -38,43 +38,20 @@ namespace detail { namespace json { /** - * @brief Class to read JSON dataset data into columns. + * @brief Reads and returns the entire data set. + * + * @param[in] sources Input `datasource` objects to read the dataset from + * @param[in] options Settings for controlling reading behavior + * @param[in] stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + * + * @return cudf::table object that contains the array of cudf::column. */ -class reader { - private: - class impl; - std::unique_ptr _impl; - - public: - /** - * @brief Constructor from an array of datasources - * - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - explicit reader(json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - - /** - * @brief Destructor explicitly-declared to avoid inlined in header - */ - ~reader(); - - /** - * @brief Reads and returns the entire data set. - * - * @param[in] sources Input `datasource` objects to read the dataset from - * @param[in] options Settings for controlling reading behavior - * @param[in] stream CUDA stream used for device memory operations and kernel launches - * - * @return cudf::table object that contains the array of cudf::column. 
- */ - table_with_metadata read(std::vector>& sources, - json_reader_options const& options, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); -}; +table_with_metadata read_json( + std::vector>& sources, + json_reader_options const& options, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace json } // namespace detail diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index b4a0ae2761f..db156144a61 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -183,8 +183,6 @@ compression_type infer_compression_type(compression_type compression, source_inf table_with_metadata read_json(json_reader_options options, rmm::mr::device_memory_resource* mr) { - namespace json = cudf::io::detail::json; - CUDF_FUNC_RANGE(); options.set_compression(infer_compression_type(options.get_compression(), options.get_source())); @@ -193,9 +191,7 @@ table_with_metadata read_json(json_reader_options options, rmm::mr::device_memor options.get_byte_range_offset(), options.get_byte_range_size_with_padding()); - auto reader = std::make_unique(options, rmm::cuda_stream_default, mr); - - return reader->read(datasources, options); + return detail::json::read_json(datasources, options, rmm::cuda_stream_default, mr); } table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_resource* mr) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 93c68752d2c..5eab32d68de 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -183,7 +183,7 @@ auto sort_keys_info_by_offset(std::unique_ptr
info) * * @return Names of JSON object keys in the file */ -std::pair, col_map_ptr_type> reader::impl::get_json_object_keys_hashes( +std::pair, col_map_ptr_type> reader_impl::get_json_object_keys_hashes( device_span rec_starts, rmm::cuda_stream_view stream) { auto info = create_json_keys_info_table( @@ -199,11 +199,11 @@ std::pair, col_map_ptr_type> reader::impl::get_json_obj create_col_names_hash_map(sorted_info->get_column(2).view(), stream)}; } -void reader::impl::ingest_raw_input(std::vector> const& sources, - std::vector& buffer, - size_t range_offset, - size_t range_size, - size_t range_size_padded) +void reader_impl::ingest_raw_input(std::vector> const& sources, + std::vector& buffer, + size_t range_offset, + size_t range_size, + size_t range_size_padded) { // Iterate through the user defined sources and read the contents into the local buffer size_t total_source_size = 0; @@ -232,8 +232,7 @@ void reader::impl::ingest_raw_input(std::vector> con * Sets the uncomp_data_ and uncomp_size_ data members * Loads the data into device memory if byte range parameters are not used */ -void reader::impl::decompress_input(std::vector const& buffer, - rmm::cuda_stream_view stream) +void reader_impl::decompress_input(std::vector const& buffer, rmm::cuda_stream_view stream) { if (options_.get_compression() == compression_type::NONE) { // Do not use the owner vector here to avoid extra copy @@ -252,7 +251,7 @@ void reader::impl::decompress_input(std::vector const& buffer, if (load_whole_source_) data_ = rmm::device_buffer(uncomp_data_, uncomp_size_, stream); } -rmm::device_uvector reader::impl::find_record_starts(rmm::cuda_stream_view stream) +rmm::device_uvector reader_impl::find_record_starts(rmm::cuda_stream_view stream) { std::vector chars_to_count{'\n'}; // Currently, ignoring lineterminations within quotes is handled by recording the records of both, @@ -327,8 +326,8 @@ rmm::device_uvector reader::impl::find_record_starts(rmm::cuda_stream_ * Only rows that need to be parsed are copied, based on the byte range * Also updates the array of record starts to match the device data offset. 
*/ -void reader::impl::upload_data_to_device(rmm::device_uvector& rec_starts, - rmm::cuda_stream_view stream) +void reader_impl::upload_data_to_device(rmm::device_uvector& rec_starts, + rmm::cuda_stream_view stream) { size_t start_offset = 0; size_t end_offset = uncomp_size_; @@ -366,8 +365,8 @@ void reader::impl::upload_data_to_device(rmm::device_uvector& rec_star data_ = rmm::device_buffer(uncomp_data_ + start_offset, bytes_to_upload, stream); } -void reader::impl::set_column_names(device_span rec_starts, - rmm::cuda_stream_view stream) +void reader_impl::set_column_names(device_span rec_starts, + rmm::cuda_stream_view stream) { // If file only contains one row, use the file size for the row size uint64_t first_row_len = data_.size() / sizeof(char); @@ -417,7 +416,7 @@ void reader::impl::set_column_names(device_span rec_starts, } } -std::vector reader::impl::parse_data_types( +std::vector reader_impl::parse_data_types( std::vector const& types_as_strings) { CUDF_EXPECTS(types_as_strings.size() == metadata_.column_names.size(), @@ -459,8 +458,8 @@ std::vector reader::impl::parse_data_types( return dtypes; } -void reader::impl::set_data_types(device_span rec_starts, - rmm::cuda_stream_view stream) +void reader_impl::set_data_types(device_span rec_starts, + rmm::cuda_stream_view stream) { bool has_to_infer_column_types = std::visit([](const auto& dtypes) { return dtypes.empty(); }, options_.get_dtypes()); @@ -528,8 +527,8 @@ void reader::impl::set_data_types(device_span rec_starts, } } -table_with_metadata reader::impl::convert_data_to_table(device_span rec_starts, - rmm::cuda_stream_view stream) +table_with_metadata reader_impl::convert_data_to_table(device_span rec_starts, + rmm::cuda_stream_view stream) { const auto num_columns = dtypes_.size(); const auto num_records = rec_starts.size(); @@ -611,9 +610,9 @@ table_with_metadata reader::impl::convert_data_to_table(device_span(std::move(out_columns)), metadata_}; } -reader::impl::impl(json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +reader_impl::reader_impl(json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) : options_(options), mr_(mr) { CUDF_EXPECTS(options_.is_enabled_lines(), "Only JSON Lines format is currently supported.\n"); @@ -634,9 +633,9 @@ reader::impl::impl(json_reader_options const& options, * * @return Table and its metadata */ -table_with_metadata reader::impl::read(std::vector>& sources, - json_reader_options const& options, - rmm::cuda_stream_view stream) +table_with_metadata reader_impl::read(std::vector>& sources, + json_reader_options const& options, + rmm::cuda_stream_view stream) { auto range_offset = options.get_byte_range_offset(); auto range_size = options.get_byte_range_size(); @@ -668,24 +667,16 @@ table_with_metadata reader::impl::read(std::vector>& return convert_data_to_table(rec_starts, stream); } -// Forward to implementation -reader::reader(json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +table_with_metadata read_json(std::vector>& sources, + json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - _impl = std::make_unique(options, stream, mr); -} + CUDF_EXPECTS(not sources.empty(), "No sources were defined"); -// Destructor within this translation unit -reader::~reader() = default; + auto impl = std::make_unique(options, stream, mr); -// Forward to implementation 
-table_with_metadata reader::read(std::vector>& sources, - json_reader_options const& options, - rmm::cuda_stream_view stream) -{ - CUDF_EXPECTS(not sources.empty(), "No sources were defined"); - return table_with_metadata{_impl->read(sources, options, stream)}; + return table_with_metadata{impl->read(sources, options, stream)}; } } // namespace json } // namespace detail diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 25ff47a8d6a..bdeaa81ba78 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -49,7 +49,7 @@ using col_map_ptr_type = std::unique_ptr Date: Thu, 19 Aug 2021 21:47:19 -0500 Subject: [PATCH 09/32] delete unused arrow namespace declaration --- cpp/include/cudf/io/detail/json.hpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 3a443b9b3d0..2417798d4af 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -25,13 +25,6 @@ #include -// Forward declarations -namespace arrow { -namespace io { -class RandomAccessFile; -} -} // namespace arrow - namespace cudf { namespace io { namespace detail { From 5a997b841cbe7f836b1300d796033f3dfa5551ec Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 19 Aug 2021 21:51:51 -0500 Subject: [PATCH 10/32] replace json reader_impl mr member with argument --- cpp/include/cudf/io/detail/json.hpp | 2 +- cpp/src/io/json/reader_impl.cu | 24 ++++++++++++------------ cpp/src/io/json/reader_impl.hpp | 14 +++++++------- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 2417798d4af..7ab8906e5a9 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -36,7 +36,7 @@ namespace json { * @param[in] sources Input `datasource` objects to read the dataset from * @param[in] options Settings for controlling reading behavior * @param[in] stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation + * @param[in] mr Device memory resource to use for device memory allocation * * @return cudf::table object that contains the array of cudf::column. 
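 *
 * Example (a minimal sketch; the `sources` vector is assumed to be built with
 * cudf::io::datasource::create, as io/functions.cpp does before forwarding here):
 * @code
 *   auto sources = cudf::io::datasource::create(filepaths);
 *   auto result  = cudf::io::detail::json::read_json(sources, options);
 * @endcode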
*/ diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 5eab32d68de..c2b1d5ed824 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -528,7 +528,8 @@ void reader_impl::set_data_types(device_span rec_starts, } table_with_metadata reader_impl::convert_data_to_table(device_span rec_starts, - rmm::cuda_stream_view stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { const auto num_columns = dtypes_.size(); const auto num_records = rec_starts.size(); @@ -536,7 +537,7 @@ table_with_metadata reader_impl::convert_data_to_table(device_span out_buffers; for (size_t col = 0; col < num_columns; ++col) { - out_buffers.emplace_back(dtypes_[col], num_records, true, stream, mr_); + out_buffers.emplace_back(dtypes_[col], num_records, true, stream, mr); } thrust::host_vector h_dtypes(num_columns); @@ -591,11 +592,11 @@ table_with_metadata reader_impl::convert_data_to_table(device_spantype().id() == type_id::STRING) { // Need to remove escape character in case of '\"' and '\\' out_columns.emplace_back(cudf::strings::detail::replace( - out_column->view(), target->view(), repl->view(), stream, mr_)); + out_column->view(), target->view(), repl->view(), stream, mr)); } else { out_columns.emplace_back(std::move(out_column)); } @@ -610,10 +611,8 @@ table_with_metadata reader_impl::convert_data_to_table(device_span(std::move(out_columns)), metadata_}; } -reader_impl::reader_impl(json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : options_(options), mr_(mr) +reader_impl::reader_impl(json_reader_options const& options, rmm::cuda_stream_view stream) + : options_(options) { CUDF_EXPECTS(options_.is_enabled_lines(), "Only JSON Lines format is currently supported.\n"); @@ -635,7 +634,8 @@ reader_impl::reader_impl(json_reader_options const& options, */ table_with_metadata reader_impl::read(std::vector>& sources, json_reader_options const& options, - rmm::cuda_stream_view stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto range_offset = options.get_byte_range_offset(); auto range_size = options.get_byte_range_size(); @@ -664,7 +664,7 @@ table_with_metadata reader_impl::read(std::vector>& set_data_types(rec_starts, stream); CUDF_EXPECTS(!dtypes_.empty(), "Error in data type detection.\n"); - return convert_data_to_table(rec_starts, stream); + return convert_data_to_table(rec_starts, stream, mr); } table_with_metadata read_json(std::vector>& sources, @@ -674,9 +674,9 @@ table_with_metadata read_json(std::vector> { CUDF_EXPECTS(not sources.empty(), "No sources were defined"); - auto impl = std::make_unique(options, stream, mr); + auto impl = std::make_unique(options, stream); - return table_with_metadata{impl->read(sources, options, stream)}; + return table_with_metadata{impl->read(sources, options, stream, mr)}; } } // namespace json } // namespace detail diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index bdeaa81ba78..4498b48741d 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -54,8 +54,6 @@ class reader_impl { private: const json_reader_options options_{}; - rmm::mr::device_memory_resource* mr_ = nullptr; - const char* uncomp_data_ = nullptr; size_t uncomp_size_ = 0; @@ -177,19 +175,19 @@ class reader_impl { * * @param[in] rec_starts Record starts in device memory * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
+ * @param[in] mr Device memory resource to use for device memory allocation * * @return Table and its metadata */ table_with_metadata convert_data_to_table(device_span rec_starts, - rmm::cuda_stream_view stream); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); public: /** * @brief Constructor from a dataset source with reader options. */ - explicit reader_impl(json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + explicit reader_impl(json_reader_options const& options, rmm::cuda_stream_view stream); /** * @brief Read an entire set or a subset of data from the source @@ -197,12 +195,14 @@ class reader_impl { * @param[in] sources Input `datasource` objects to read the dataset from * @param[in] options Settings for controlling reading behavior * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @param[in] mr Device memory resource to use for device memory allocation * * @return Table and its metadata */ table_with_metadata read(std::vector>& sources, json_reader_options const& options, - rmm::cuda_stream_view stream); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); }; } // namespace json From 89163fb5b5748975b2d51a004c93a3489854abfe Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 19 Aug 2021 22:07:46 -0500 Subject: [PATCH 11/32] replace json reader_impl options with local variable --- cpp/src/io/json/reader_impl.cu | 55 ++++++++++++++++----------------- cpp/src/io/json/reader_impl.hpp | 16 +++++----- 2 files changed, 34 insertions(+), 37 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index c2b1d5ed824..9a1ee49a7de 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -111,7 +111,7 @@ col_map_ptr_type create_col_names_hash_map(column_view column_name_hashes, * * @return std::unique_ptr
<table> cudf table with three columns (offsets, lengths, hashes) */ -std::unique_ptr<table>
create_json_keys_info_table(const parse_options_view& options, +std::unique_ptr<table>
create_json_keys_info_table(parse_options_view const& parse_opts, device_span const data, device_span const row_offsets, rmm::cuda_stream_view stream) @@ -119,7 +119,7 @@ std::unique_ptr
create_json_keys_info_table(const parse_options_view& opt // Count keys rmm::device_scalar key_counter(0, stream); cudf::io::json::gpu::collect_keys_info( - options, data, row_offsets, key_counter.data(), {}, stream); + parse_opts, data, row_offsets, key_counter.data(), {}, stream); // Allocate columns to store hash value, length, and offset of each JSON object key in the input auto const num_keys = key_counter.value(stream); @@ -135,7 +135,7 @@ std::unique_ptr
create_json_keys_info_table(const parse_options_view& opt key_counter.set_value_to_zero_async(stream); // Fill the allocated columns cudf::io::json::gpu::collect_keys_info( - options, data, row_offsets, key_counter.data(), {*info_table_mdv}, stream); + parse_opts, data, row_offsets, key_counter.data(), {*info_table_mdv}, stream); return info_table; } @@ -232,9 +232,11 @@ void reader_impl::ingest_raw_input(std::vector> cons * Sets the uncomp_data_ and uncomp_size_ data members * Loads the data into device memory if byte range parameters are not used */ -void reader_impl::decompress_input(std::vector const& buffer, rmm::cuda_stream_view stream) +void reader_impl::decompress_input(json_reader_options const& read_opts, + std::vector const& buffer, + rmm::cuda_stream_view stream) { - if (options_.get_compression() == compression_type::NONE) { + if (read_opts.get_compression() == compression_type::NONE) { // Do not use the owner vector here to avoid extra copy uncomp_data_ = reinterpret_cast(buffer.data()); uncomp_size_ = buffer.size(); @@ -243,7 +245,7 @@ void reader_impl::decompress_input(std::vector const& buffer, rmm::cuda host_span( // reinterpret_cast(buffer.data()), buffer.size()), - options_.get_compression()); + read_opts.get_compression()); uncomp_data_ = uncomp_data_owner_.data(); uncomp_size_ = uncomp_data_owner_.size(); @@ -458,11 +460,12 @@ std::vector reader_impl::parse_data_types( return dtypes; } -void reader_impl::set_data_types(device_span rec_starts, +void reader_impl::set_data_types(json_reader_options const& reader_opts, + device_span rec_starts, rmm::cuda_stream_view stream) { bool has_to_infer_column_types = - std::visit([](const auto& dtypes) { return dtypes.empty(); }, options_.get_dtypes()); + std::visit([](const auto& dtypes) { return dtypes.empty(); }, reader_opts.get_dtypes()); if (!has_to_infer_column_types) { dtypes_ = std::visit( cudf::detail::visitor_overload{ @@ -480,7 +483,7 @@ void reader_impl::set_data_types(device_span rec_starts, return sorted_dtypes; }, [&](std::vector const& dtypes) { return parse_data_types(dtypes); }}, - options_.get_dtypes()); + reader_opts.get_dtypes()); } else { CUDF_EXPECTS(rec_starts.size() != 0, "No data available for data type inference.\n"); auto const num_columns = metadata_.column_names.size(); @@ -611,18 +614,6 @@ table_with_metadata reader_impl::convert_data_to_table(device_span(std::move(out_columns)), metadata_}; } -reader_impl::reader_impl(json_reader_options const& options, rmm::cuda_stream_view stream) - : options_(options) -{ - CUDF_EXPECTS(options_.is_enabled_lines(), "Only JSON Lines format is currently supported.\n"); - - opts_.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); - opts_.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); - opts_.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); - - opts_.dayfirst = options.is_enabled_dayfirst(); -} - /** * @brief Read an entire set or a subset of data from the source * @@ -633,13 +624,21 @@ reader_impl::reader_impl(json_reader_options const& options, rmm::cuda_stream_vi * @return Table and its metadata */ table_with_metadata reader_impl::read(std::vector>& sources, - json_reader_options const& options, + json_reader_options const& reader_opts, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto range_offset = options.get_byte_range_offset(); - auto range_size = options.get_byte_range_size(); - auto range_size_padded = options.get_byte_range_size_with_padding(); + 
CUDF_EXPECTS(reader_opts.is_enabled_lines(), "Only JSON Lines format is currently supported.\n"); + + opts_.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + opts_.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + opts_.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); + + opts_.dayfirst = reader_opts.is_enabled_dayfirst(); + + auto range_offset = reader_opts.get_byte_range_offset(); + auto range_size = reader_opts.get_byte_range_size(); + auto range_size_padded = reader_opts.get_byte_range_size_with_padding(); std::vector buffer; @@ -647,7 +646,7 @@ table_with_metadata reader_impl::read(std::vector>& CUDF_EXPECTS(buffer.size() != 0, "Ingest failed: input data is null.\n"); - decompress_input(buffer, stream); + decompress_input(reader_opts, buffer, stream); CUDF_EXPECTS(uncomp_data_ != nullptr, "Ingest failed: uncompressed input data is null.\n"); CUDF_EXPECTS(uncomp_size_ != 0, "Ingest failed: uncompressed input data has zero size.\n"); @@ -661,7 +660,7 @@ table_with_metadata reader_impl::read(std::vector>& set_column_names(rec_starts, stream); CUDF_EXPECTS(!metadata_.column_names.empty(), "Error determining column names.\n"); - set_data_types(rec_starts, stream); + set_data_types(reader_opts, rec_starts, stream); CUDF_EXPECTS(!dtypes_.empty(), "Error in data type detection.\n"); return convert_data_to_table(rec_starts, stream, mr); @@ -674,7 +673,7 @@ table_with_metadata read_json(std::vector> { CUDF_EXPECTS(not sources.empty(), "No sources were defined"); - auto impl = std::make_unique(options, stream); + auto impl = std::make_unique(); return table_with_metadata{impl->read(sources, options, stream, mr)}; } diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 4498b48741d..6cfc85d880e 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -52,8 +52,6 @@ using col_map_ptr_type = std::unique_ptr const& buffer, rmm::cuda_stream_view stream); + void decompress_input(json_reader_options const& options, + std::vector const& buffer, + rmm::cuda_stream_view stream); /** * @brief Finds all record starts in the file. @@ -165,10 +165,13 @@ class reader_impl { * * If user does not pass the data types, deduces types from the file content * + * @param[in] reader_opts Settings for controlling reading behavior * @param[in] rec_starts Record starts in device memory * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ - void set_data_types(device_span rec_starts, rmm::cuda_stream_view stream); + void set_data_types(json_reader_options const& reader_opts, + device_span rec_starts, + rmm::cuda_stream_view stream); /** * @brief Parse the input data and store results a table @@ -184,11 +187,6 @@ class reader_impl { rmm::mr::device_memory_resource* mr); public: - /** - * @brief Constructor from a dataset source with reader options. 
- */ - explicit reader_impl(json_reader_options const& options, rmm::cuda_stream_view stream); - /** * @brief Read an entire set or a subset of data from the source * From 7ff3f1a9264e1b15c2e81eb7a4ff3b824567f0c3 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 19 Aug 2021 22:11:51 -0500 Subject: [PATCH 12/32] remove unused json_reader::allow_newlines_in_strings_ member --- cpp/src/io/json/reader_impl.cu | 23 ----------------------- cpp/src/io/json/reader_impl.hpp | 1 - 2 files changed, 24 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 9a1ee49a7de..d1ca067aaf5 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -259,7 +259,6 @@ rmm::device_uvector reader_impl::find_record_starts(rmm::cuda_stream_v // Currently, ignoring lineterminations within quotes is handled by recording the records of both, // and then filtering out the records that is a quotechar or a linetermination within a quotechar // pair. - if (allow_newlines_in_strings_) { chars_to_count.push_back('\"'); } // If not starting at an offset, add an extra row to account for the first row in the file cudf::size_type prefilter_count = ((byte_range_offset_ == 0) ? 1 : 0); if (load_whole_source_) { @@ -278,7 +277,6 @@ rmm::device_uvector reader_impl::find_record_starts(rmm::cuda_stream_v } std::vector chars_to_find{'\n'}; - if (allow_newlines_in_strings_) { chars_to_find.push_back('\"'); } // Passing offset = 1 to return positions AFTER the found character if (load_whole_source_) { find_all_from_set(data_, chars_to_find, 1, find_result_ptr, stream); @@ -292,27 +290,6 @@ rmm::device_uvector reader_impl::find_record_starts(rmm::cuda_stream_v thrust::sort(rmm::exec_policy(stream), rec_starts.begin(), rec_starts.end()); auto filtered_count = prefilter_count; - if (allow_newlines_in_strings_) { - auto h_rec_starts = cudf::detail::make_std_vector_sync(rec_starts, stream); - bool quotation = false; - for (cudf::size_type i = 1; i < prefilter_count; ++i) { - if (uncomp_data_[h_rec_starts[i] - 1] == '\"') { - quotation = !quotation; - h_rec_starts[i] = uncomp_size_; - filtered_count--; - } else if (quotation) { - h_rec_starts[i] = uncomp_size_; - filtered_count--; - } - } - CUDA_TRY(cudaMemcpyAsync(rec_starts.data(), - h_rec_starts.data(), - h_rec_starts.size() * sizeof(uint64_t), - cudaMemcpyDefault, - stream.value())); - thrust::sort(rmm::exec_policy(stream), rec_starts.begin(), rec_starts.end()); - stream.synchronize(); - } // Exclude the ending newline as it does not precede a record start if (uncomp_data_[uncomp_size_ - 1] == '\n') { filtered_count--; } diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 6cfc85d880e..3923890f583 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -72,7 +72,6 @@ class reader_impl { std::unique_ptr> d_key_col_map_; // parsing options - const bool allow_newlines_in_strings_ = false; parse_options opts_{',', '\n', '\"', '.'}; /** From 5c95398ba19a82910a2d6fd946ce54d86443708c Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 19 Aug 2021 22:22:42 -0500 Subject: [PATCH 13/32] replace json reader_impl::opts_ with local variable --- cpp/src/io/json/reader_impl.cu | 40 ++++++++++++++++++++------------- cpp/src/io/json/reader_impl.hpp | 15 ++++++++----- 2 files changed, 33 insertions(+), 22 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index d1ca067aaf5..25fd17894a4 100644 --- 
a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -184,10 +184,12 @@ auto sort_keys_info_by_offset(std::unique_ptr<table>
info) * @return Names of JSON object keys in the file */ std::pair, col_map_ptr_type> reader_impl::get_json_object_keys_hashes( - device_span rec_starts, rmm::cuda_stream_view stream) + parse_options_view const& parse_opts, + device_span rec_starts, + rmm::cuda_stream_view stream) { auto info = create_json_keys_info_table( - opts_.view(), + parse_opts, device_span(static_cast(data_.data()), data_.size()), rec_starts, stream); @@ -344,7 +346,8 @@ void reader_impl::upload_data_to_device(rmm::device_uvector& rec_start data_ = rmm::device_buffer(uncomp_data_ + start_offset, bytes_to_upload, stream); } -void reader_impl::set_column_names(device_span rec_starts, +void reader_impl::set_column_names(parse_options_view const& parse_opts, + device_span rec_starts, rmm::cuda_stream_view stream) { // If file only contains one row, use the file size for the row size @@ -376,7 +379,7 @@ void reader_impl::set_column_names(device_span rec_starts, // If the first opening bracket is '{', assume object format if (first_curly_bracket < first_square_bracket) { // use keys as column names if input rows are objects - auto keys_desc = get_json_object_keys_hashes(rec_starts, stream); + auto keys_desc = get_json_object_keys_hashes(parse_opts, rec_starts, stream); metadata_.column_names = keys_desc.first; set_column_map(std::move(keys_desc.second), stream); } else { @@ -384,11 +387,12 @@ void reader_impl::set_column_names(device_span rec_starts, bool quotation = false; for (size_t pos = 0; pos < first_row.size(); ++pos) { // Flip the quotation flag if current character is a quotechar - if (first_row[pos] == opts_.quotechar) { + if (first_row[pos] == parse_opts.quotechar) { quotation = !quotation; } // Check if end of a column/row - else if (pos == first_row.size() - 1 || (!quotation && first_row[pos] == opts_.delimiter)) { + else if (pos == first_row.size() - 1 || + (!quotation && first_row[pos] == parse_opts.delimiter)) { metadata_.column_names.emplace_back(std::to_string(cols_found++)); } } @@ -438,6 +442,7 @@ std::vector reader_impl::parse_data_types( } void reader_impl::set_data_types(json_reader_options const& reader_opts, + parse_options_view const& parse_opts, device_span rec_starts, rmm::cuda_stream_view stream) { @@ -467,7 +472,7 @@ void reader_impl::set_data_types(json_reader_options const& reader_opts, auto const do_set_null_count = key_to_col_idx_map_ != nullptr; auto const h_column_infos = cudf::io::json::gpu::detect_data_types( - opts_.view(), + parse_opts, device_span(static_cast(data_.data()), data_.size()), rec_starts, do_set_null_count, @@ -507,7 +512,8 @@ void reader_impl::set_data_types(json_reader_options const& reader_opts, } } -table_with_metadata reader_impl::convert_data_to_table(device_span rec_starts, +table_with_metadata reader_impl::convert_data_to_table(parse_options_view const& parse_opts, + device_span rec_starts, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -537,7 +543,7 @@ table_with_metadata reader_impl::convert_data_to_table(device_span(num_columns, stream); cudf::io::json::gpu::convert_json_to_columns( - opts_.view(), + parse_opts, device_span(static_cast(data_.data()), data_.size()), rec_starts, d_dtypes, @@ -607,11 +613,13 @@ table_with_metadata reader_impl::read(std::vector>& { CUDF_EXPECTS(reader_opts.is_enabled_lines(), "Only JSON Lines format is currently supported.\n"); - opts_.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); - opts_.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); - opts_.trie_na = 
cudf::detail::create_serialized_trie({"", "null"}, stream); + auto parse_opts = parse_options{',', '\n', '\"', '.'}; - opts_.dayfirst = reader_opts.is_enabled_dayfirst(); + parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + parse_opts.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); + + parse_opts.dayfirst = reader_opts.is_enabled_dayfirst(); auto range_offset = reader_opts.get_byte_range_offset(); auto range_size = reader_opts.get_byte_range_size(); @@ -634,13 +642,13 @@ table_with_metadata reader_impl::read(std::vector>& upload_data_to_device(rec_starts, stream); CUDF_EXPECTS(data_.size() != 0, "Error uploading input data to the GPU.\n"); - set_column_names(rec_starts, stream); + set_column_names(parse_opts.view(), rec_starts, stream); CUDF_EXPECTS(!metadata_.column_names.empty(), "Error determining column names.\n"); - set_data_types(reader_opts, rec_starts, stream); + set_data_types(reader_opts, parse_opts.view(), rec_starts, stream); CUDF_EXPECTS(!dtypes_.empty(), "Error in data type detection.\n"); - return convert_data_to_table(rec_starts, stream, mr); + return convert_data_to_table(parse_opts.view(), rec_starts, stream, mr); } table_with_metadata read_json(std::vector>& sources, diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 3923890f583..3444f33bc62 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -71,9 +71,6 @@ class reader_impl { col_map_ptr_type key_to_col_idx_map_; std::unique_ptr> d_key_col_map_; - // parsing options - parse_options opts_{',', '\n', '\"', '.'}; - /** * @brief Sets the column map data member and makes a device copy to be used as a kernel * parameter. @@ -116,7 +113,9 @@ class reader_impl { * @return Array of keys and a map that maps their hash values to column indices */ std::pair, col_map_ptr_type> get_json_object_keys_hashes( - device_span rec_starts, rmm::cuda_stream_view stream); + parse_options_view const& parse_opts, + device_span rec_starts, + rmm::cuda_stream_view stream); /** * @brief Decompress the input data, if needed @@ -155,7 +154,9 @@ class reader_impl { * @param[in] rec_starts Record starts in device memory * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ - void set_column_names(device_span rec_starts, rmm::cuda_stream_view stream); + void set_column_names(parse_options_view const& parse_opts, + device_span rec_starts, + rmm::cuda_stream_view stream); std::vector parse_data_types(std::vector const& types_as_strings); @@ -169,6 +170,7 @@ class reader_impl { * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
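 * @param[in] parse_opts Parsing options (delimiter, quote character, and tries) used to interpret the raw input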
*/ void set_data_types(json_reader_options const& reader_opts, + parse_options_view const& parse_opts, device_span rec_starts, rmm::cuda_stream_view stream); @@ -181,7 +183,8 @@ class reader_impl { * * @return Table and its metadata */ - table_with_metadata convert_data_to_table(device_span rec_starts, + table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, + device_span rec_starts, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); From 0bde0b1ce6de927da911b05052ac6d8d4c30cc7b Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 19 Aug 2021 22:34:16 -0500 Subject: [PATCH 14/32] remove json reader_impl byte_range members in place of local variables --- cpp/src/io/json/reader_impl.cu | 42 +++++++++++++++++++-------------- cpp/src/io/json/reader_impl.hpp | 10 ++++---- 2 files changed, 28 insertions(+), 24 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 25fd17894a4..c5f08797aea 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -222,10 +222,12 @@ void reader_impl::ingest_raw_input(std::vector> cons bytes_read += source->host_read(range_offset, data_size, &buffer[bytes_read]); } } +} - byte_range_offset_ = range_offset; - byte_range_size_ = range_size; - load_whole_source_ = byte_range_offset_ == 0 && byte_range_size_ == 0; +bool should_load_whole_source(json_reader_options const& reader_opts) +{ + return reader_opts.get_byte_range_offset() == 0 and // + reader_opts.get_byte_range_size() == 0; } /** @@ -234,11 +236,11 @@ void reader_impl::ingest_raw_input(std::vector> cons * Sets the uncomp_data_ and uncomp_size_ data members * Loads the data into device memory if byte range parameters are not used */ -void reader_impl::decompress_input(json_reader_options const& read_opts, +void reader_impl::decompress_input(json_reader_options const& reader_opts, std::vector const& buffer, rmm::cuda_stream_view stream) { - if (read_opts.get_compression() == compression_type::NONE) { + if (reader_opts.get_compression() == compression_type::NONE) { // Do not use the owner vector here to avoid extra copy uncomp_data_ = reinterpret_cast(buffer.data()); uncomp_size_ = buffer.size(); @@ -247,23 +249,26 @@ void reader_impl::decompress_input(json_reader_options const& read_opts, host_span( // reinterpret_cast(buffer.data()), buffer.size()), - read_opts.get_compression()); + reader_opts.get_compression()); uncomp_data_ = uncomp_data_owner_.data(); uncomp_size_ = uncomp_data_owner_.size(); } - if (load_whole_source_) data_ = rmm::device_buffer(uncomp_data_, uncomp_size_, stream); + if (should_load_whole_source(reader_opts)) { + data_ = rmm::device_buffer(uncomp_data_, uncomp_size_, stream); + } } -rmm::device_uvector reader_impl::find_record_starts(rmm::cuda_stream_view stream) +rmm::device_uvector reader_impl::find_record_starts( + json_reader_options const& reader_opts, rmm::cuda_stream_view stream) { std::vector chars_to_count{'\n'}; // Currently, ignoring lineterminations within quotes is handled by recording the records of both, // and then filtering out the records that is a quotechar or a linetermination within a quotechar // pair. // If not starting at an offset, add an extra row to account for the first row in the file - cudf::size_type prefilter_count = ((byte_range_offset_ == 0) ? 1 : 0); - if (load_whole_source_) { + cudf::size_type prefilter_count = ((reader_opts.get_byte_range_offset() == 0) ? 
1 : 0); + if (should_load_whole_source(reader_opts)) { prefilter_count += count_all_from_set(data_, chars_to_count, stream); } else { prefilter_count += count_all_from_set(uncomp_data_, uncomp_size_, chars_to_count, stream); @@ -273,14 +278,14 @@ rmm::device_uvector reader_impl::find_record_starts(rmm::cuda_stream_v auto* find_result_ptr = rec_starts.data(); // Manually adding an extra row to account for the first row in the file - if (byte_range_offset_ == 0) { + if (reader_opts.get_byte_range_offset() == 0) { find_result_ptr++; CUDA_TRY(cudaMemsetAsync(rec_starts.data(), 0ull, sizeof(uint64_t), stream.value())); } std::vector chars_to_find{'\n'}; // Passing offset = 1 to return positions AFTER the found character - if (load_whole_source_) { + if (should_load_whole_source(reader_opts)) { find_all_from_set(data_, chars_to_find, 1, find_result_ptr, stream); } else { find_all_from_set(uncomp_data_, uncomp_size_, chars_to_find, 1, find_result_ptr, stream); @@ -307,19 +312,20 @@ rmm::device_uvector reader_impl::find_record_starts(rmm::cuda_stream_v * Only rows that need to be parsed are copied, based on the byte range * Also updates the array of record starts to match the device data offset. */ -void reader_impl::upload_data_to_device(rmm::device_uvector& rec_starts, +void reader_impl::upload_data_to_device(json_reader_options const& reader_opts, + rmm::device_uvector& rec_starts, rmm::cuda_stream_view stream) { size_t start_offset = 0; size_t end_offset = uncomp_size_; // Trim lines that are outside range - if (byte_range_size_ != 0 || byte_range_offset_ != 0) { + if (reader_opts.get_byte_range_size() != 0 || reader_opts.get_byte_range_offset() != 0) { auto h_rec_starts = cudf::detail::make_std_vector_sync(rec_starts, stream); - if (byte_range_size_ != 0) { + if (reader_opts.get_byte_range_size() != 0) { auto it = h_rec_starts.end() - 1; - while (it >= h_rec_starts.begin() && *it > byte_range_size_) { + while (it >= h_rec_starts.begin() && *it > reader_opts.get_byte_range_size()) { end_offset = *it; --it; } @@ -636,10 +642,10 @@ table_with_metadata reader_impl::read(std::vector>& CUDF_EXPECTS(uncomp_data_ != nullptr, "Ingest failed: uncompressed input data is null.\n"); CUDF_EXPECTS(uncomp_size_ != 0, "Ingest failed: uncompressed input data has zero size.\n"); - auto rec_starts = find_record_starts(stream); + auto rec_starts = find_record_starts(reader_opts, stream); CUDF_EXPECTS(!rec_starts.is_empty(), "Error enumerating records.\n"); - upload_data_to_device(rec_starts, stream); + upload_data_to_device(reader_opts, rec_starts, stream); CUDF_EXPECTS(data_.size() != 0, "Error uploading input data to the GPU.\n"); set_column_names(parse_opts.view(), rec_starts, stream); diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 3444f33bc62..1a4eb282b38 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -59,10 +59,6 @@ class reader_impl { std::vector uncomp_data_owner_; rmm::device_buffer data_; - size_t byte_range_offset_ = 0; - size_t byte_range_size_ = 0; - bool load_whole_source_ = true; - table_metadata metadata_; std::vector dtypes_; @@ -134,7 +130,8 @@ class reader_impl { * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
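 * @param[in] reader_opts Settings for controlling reading behavior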
* @return Record starts in the device memory */ - rmm::device_uvector find_record_starts(rmm::cuda_stream_view stream); + rmm::device_uvector find_record_starts(json_reader_options const& reader_opts, + rmm::cuda_stream_view stream); /** * @brief Uploads the relevant segment of the input json data onto the GPU. @@ -143,7 +140,8 @@ class reader_impl { * Only rows that need to be parsed are copied, based on the byte range * Also updates the array of record starts to match the device data offset. */ - void upload_data_to_device(rmm::device_uvector& rec_starts, + void upload_data_to_device(json_reader_options const& reader_opts, + rmm::device_uvector& rec_starts, rmm::cuda_stream_view stream); /** From b4843f5e9edec7ed88d7dda1420ebccc5ca4c246 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 19 Aug 2021 22:58:24 -0500 Subject: [PATCH 15/32] replace json reader_impl::dtypes_ with local variable --- cpp/src/io/json/reader_impl.cu | 31 +++++++++++++++++++------------ cpp/src/io/json/reader_impl.hpp | 10 +++++----- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index c5f08797aea..0eaebd04008 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -447,15 +447,16 @@ std::vector reader_impl::parse_data_types( return dtypes; } -void reader_impl::set_data_types(json_reader_options const& reader_opts, - parse_options_view const& parse_opts, - device_span rec_starts, - rmm::cuda_stream_view stream) +std::vector reader_impl::get_data_types(json_reader_options const& reader_opts, + parse_options_view const& parse_opts, + device_span rec_starts, + rmm::cuda_stream_view stream) { bool has_to_infer_column_types = std::visit([](const auto& dtypes) { return dtypes.empty(); }, reader_opts.get_dtypes()); + if (!has_to_infer_column_types) { - dtypes_ = std::visit( + return std::visit( cudf::detail::visitor_overload{ [&](const std::vector& dtypes) { return dtypes; }, [&](const std::map& dtypes) { @@ -511,25 +512,30 @@ void reader_impl::set_data_types(json_reader_options const& reader_opts, } }; + std::vector dtypes; + std::transform(std::cbegin(h_column_infos), std::cend(h_column_infos), - std::back_inserter(dtypes_), + std::back_inserter(dtypes), [&](auto const& cinfo) { return data_type{get_type_id(cinfo)}; }); + + return dtypes; } } table_with_metadata reader_impl::convert_data_to_table(parse_options_view const& parse_opts, + std::vector const& dtypes, device_span rec_starts, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - const auto num_columns = dtypes_.size(); + const auto num_columns = dtypes.size(); const auto num_records = rec_starts.size(); // alloc output buffers. 
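 // Each column_buffer owns the device data and the null mask for one output column.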
std::vector out_buffers; for (size_t col = 0; col < num_columns; ++col) { - out_buffers.emplace_back(dtypes_[col], num_records, true, stream, mr); + out_buffers.emplace_back(dtypes[col], num_records, true, stream, mr); } thrust::host_vector h_dtypes(num_columns); @@ -537,7 +543,7 @@ table_with_metadata reader_impl::convert_data_to_table(parse_options_view const& thrust::host_vector h_valid(num_columns); for (size_t i = 0; i < num_columns; ++i) { - h_dtypes[i] = dtypes_[i]; + h_dtypes[i] = dtypes[i]; h_data[i] = out_buffers[i].data(); h_valid[i] = out_buffers[i].null_mask(); } @@ -651,10 +657,11 @@ table_with_metadata reader_impl::read(std::vector>& set_column_names(parse_opts.view(), rec_starts, stream); CUDF_EXPECTS(!metadata_.column_names.empty(), "Error determining column names.\n"); - set_data_types(reader_opts, parse_opts.view(), rec_starts, stream); - CUDF_EXPECTS(!dtypes_.empty(), "Error in data type detection.\n"); + auto dtypes = get_data_types(reader_opts, parse_opts.view(), rec_starts, stream); + + CUDF_EXPECTS(not dtypes.empty(), "Error in data type detection.\n"); - return convert_data_to_table(parse_opts.view(), rec_starts, stream, mr); + return convert_data_to_table(parse_opts.view(), dtypes, rec_starts, stream, mr); } table_with_metadata read_json(std::vector>& sources, diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 1a4eb282b38..8d4900325b8 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -60,7 +60,6 @@ class reader_impl { rmm::device_buffer data_; table_metadata metadata_; - std::vector dtypes_; // the map is only used for files with rows in object format; initialize to a dummy value so the // map object can be passed to the kernel in any case @@ -167,10 +166,10 @@ class reader_impl { * @param[in] rec_starts Record starts in device memory * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
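 * @return Data types of the columns, in the same order as the column names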
*/ - void set_data_types(json_reader_options const& reader_opts, - parse_options_view const& parse_opts, - device_span rec_starts, - rmm::cuda_stream_view stream); + std::vector get_data_types(json_reader_options const& reader_opts, + parse_options_view const& parse_opts, + device_span rec_starts, + rmm::cuda_stream_view stream); /** * @brief Parse the input data and store results a table @@ -182,6 +181,7 @@ class reader_impl { * @return Table and its metadata */ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, + std::vector const& dtypes, device_span rec_starts, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); From f511e6800cd0b58b68fcb8bdf3113eca5cea8413 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 20 Aug 2021 16:55:20 -0500 Subject: [PATCH 16/32] replace json reader_impl::metadata_ with local variable --- cpp/src/io/json/reader_impl.cu | 77 ++++++++++++++++++--------------- cpp/src/io/json/reader_impl.hpp | 13 +++--- 2 files changed, 49 insertions(+), 41 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 0eaebd04008..65179866595 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -352,9 +352,9 @@ void reader_impl::upload_data_to_device(json_reader_options const& reader_opts, data_ = rmm::device_buffer(uncomp_data_ + start_offset, bytes_to_upload, stream); } -void reader_impl::set_column_names(parse_options_view const& parse_opts, - device_span rec_starts, - rmm::cuda_stream_view stream) +std::vector reader_impl::get_column_names(parse_options_view const& parse_opts, + device_span rec_starts, + rmm::cuda_stream_view stream) { // If file only contains one row, use the file size for the row size uint64_t first_row_len = data_.size() / sizeof(char); @@ -385,12 +385,13 @@ void reader_impl::set_column_names(parse_options_view const& parse_opts, // If the first opening bracket is '{', assume object format if (first_curly_bracket < first_square_bracket) { // use keys as column names if input rows are objects - auto keys_desc = get_json_object_keys_hashes(parse_opts, rec_starts, stream); - metadata_.column_names = keys_desc.first; + auto keys_desc = get_json_object_keys_hashes(parse_opts, rec_starts, stream); set_column_map(std::move(keys_desc.second), stream); + return keys_desc.first; } else { - int cols_found = 0; - bool quotation = false; + int cols_found = 0; + bool quotation = false; + auto column_names = std::vector(); for (size_t pos = 0; pos < first_row.size(); ++pos) { // Flip the quotation flag if current character is a quotechar if (first_row[pos] == parse_opts.quotechar) { @@ -399,16 +400,17 @@ void reader_impl::set_column_names(parse_options_view const& parse_opts, // Check if end of a column/row else if (pos == first_row.size() - 1 || (!quotation && first_row[pos] == parse_opts.delimiter)) { - metadata_.column_names.emplace_back(std::to_string(cols_found++)); + column_names.emplace_back(std::to_string(cols_found++)); } } + return column_names; } } std::vector reader_impl::parse_data_types( - std::vector const& types_as_strings) + std::vector const& column_names, std::vector const& types_as_strings) { - CUDF_EXPECTS(types_as_strings.size() == metadata_.column_names.size(), + CUDF_EXPECTS(types_as_strings.size() == column_names.size(), "Need to specify the type of each column.\n"); std::vector dtypes; // Assume that the dtype is in dictionary format only if all elements contain a colon @@ -434,8 +436,8 @@ std::vector 
reader_impl::parse_data_types( }); // Using the map here allows O(n log n) complexity - std::transform(std::cbegin(metadata_.column_names), - std::cend(metadata_.column_names), + std::transform(std::cbegin(column_names), + std::cend(column_names), std::back_inserter(dtypes), [&](auto const& column_name) { return col_type_map[column_name]; }); } else { @@ -449,6 +451,7 @@ std::vector reader_impl::parse_data_types( std::vector reader_impl::get_data_types(json_reader_options const& reader_opts, parse_options_view const& parse_opts, + std::vector const& column_names, device_span rec_starts, rmm::cuda_stream_view stream) { @@ -456,26 +459,28 @@ std::vector reader_impl::get_data_types(json_reader_options const& re std::visit([](const auto& dtypes) { return dtypes.empty(); }, reader_opts.get_dtypes()); if (!has_to_infer_column_types) { - return std::visit( - cudf::detail::visitor_overload{ - [&](const std::vector& dtypes) { return dtypes; }, - [&](const std::map& dtypes) { - std::vector sorted_dtypes; - std::transform(std::cbegin(metadata_.column_names), - std::cend(metadata_.column_names), - std::back_inserter(sorted_dtypes), - [&](auto const& column_name) { - auto const it = dtypes.find(column_name); - CUDF_EXPECTS(it != dtypes.end(), "Must specify types for all columns"); - return it->second; - }); - return sorted_dtypes; - }, - [&](std::vector const& dtypes) { return parse_data_types(dtypes); }}, - reader_opts.get_dtypes()); + return std::visit(cudf::detail::visitor_overload{ + [&](const std::vector& dtypes) { return dtypes; }, + [&](const std::map& dtypes) { + std::vector sorted_dtypes; + std::transform(std::cbegin(column_names), + std::cend(column_names), + std::back_inserter(sorted_dtypes), + [&](auto const& column_name) { + auto const it = dtypes.find(column_name); + CUDF_EXPECTS(it != dtypes.end(), + "Must specify types for all columns"); + return it->second; + }); + return sorted_dtypes; + }, + [&](std::vector const& dtypes) { + return parse_data_types(column_names, dtypes); + }}, + reader_opts.get_dtypes()); } else { CUDF_EXPECTS(rec_starts.size() != 0, "No data available for data type inference.\n"); - auto const num_columns = metadata_.column_names.size(); + auto const num_columns = column_names.size(); auto const do_set_null_count = key_to_col_idx_map_ != nullptr; auto const h_column_infos = cudf::io::json::gpu::detect_data_types( @@ -525,6 +530,7 @@ std::vector reader_impl::get_data_types(json_reader_options const& re table_with_metadata reader_impl::convert_data_to_table(parse_options_view const& parse_opts, std::vector const& dtypes, + std::vector const& column_names, device_span rec_starts, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -606,7 +612,7 @@ table_with_metadata reader_impl::convert_data_to_table(parse_options_view const& CUDF_EXPECTS(!out_columns.empty(), "No columns created from json input"); - return table_with_metadata{std::make_unique
<table>(std::move(out_columns)), metadata_}; } + return table_with_metadata{std::make_unique<table>
(std::move(out_columns)), {column_names}}; } /** @@ -654,14 +660,15 @@ table_with_metadata reader_impl::read(std::vector>& upload_data_to_device(reader_opts, rec_starts, stream); CUDF_EXPECTS(data_.size() != 0, "Error uploading input data to the GPU.\n"); - set_column_names(parse_opts.view(), rec_starts, stream); - CUDF_EXPECTS(!metadata_.column_names.empty(), "Error determining column names.\n"); + auto column_names = get_column_names(parse_opts.view(), rec_starts, stream); + + CUDF_EXPECTS(not column_names.empty(), "Error determining column names.\n"); - auto dtypes = get_data_types(reader_opts, parse_opts.view(), rec_starts, stream); + auto dtypes = get_data_types(reader_opts, parse_opts.view(), column_names, rec_starts, stream); CUDF_EXPECTS(not dtypes.empty(), "Error in data type detection.\n"); - return convert_data_to_table(parse_opts.view(), dtypes, rec_starts, stream, mr); + return convert_data_to_table(parse_opts.view(), dtypes, column_names, rec_starts, stream, mr); } table_with_metadata read_json(std::vector>& sources, diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 8d4900325b8..75031ebb68a 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -59,8 +59,6 @@ class reader_impl { std::vector uncomp_data_owner_; rmm::device_buffer data_; - table_metadata metadata_; - // the map is only used for files with rows in object format; initialize to a dummy value so the // map object can be passed to the kernel in any case col_map_ptr_type key_to_col_idx_map_; @@ -151,11 +149,12 @@ class reader_impl { * @param[in] rec_starts Record starts in device memory * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ - void set_column_names(parse_options_view const& parse_opts, - device_span rec_starts, - rmm::cuda_stream_view stream); + std::vector get_column_names(parse_options_view const& parse_opts, + device_span rec_starts, + rmm::cuda_stream_view stream); - std::vector parse_data_types(std::vector const& types_as_strings); + std::vector parse_data_types(std::vector const& column_names, + std::vector const& types_as_strings); /** * @brief Set the data type array data member @@ -168,6 +167,7 @@ class reader_impl { */ std::vector get_data_types(json_reader_options const& reader_opts, parse_options_view const& parse_opts, + std::vector const& column_names, device_span rec_starts, rmm::cuda_stream_view stream); @@ -182,6 +182,7 @@ class reader_impl { */ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, std::vector const& dtypes, + std::vector const& column_names, device_span rec_starts, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); From 5e307b5bcb2e18f12ffd99a1ddb55497a8b68b24 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 20 Aug 2021 17:30:48 -0500 Subject: [PATCH 17/32] replace json reader_impl::data_ with local variable --- cpp/src/io/json/reader_impl.cu | 106 ++++++++++++++----------- cpp/src/io/json/reader_impl.hpp | 18 +++-- cpp/src/io/utilities/parsing_utils.cu | 59 ++++++-------- cpp/src/io/utilities/parsing_utils.cuh | 14 ++-- 4 files changed, 102 insertions(+), 95 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 65179866595..7166cb776c8 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -186,13 +186,10 @@ auto sort_keys_info_by_offset(std::unique_ptr
info) std::pair, col_map_ptr_type> reader_impl::get_json_object_keys_hashes( parse_options_view const& parse_opts, device_span rec_starts, + device_span data, rmm::cuda_stream_view stream) { - auto info = create_json_keys_info_table( - parse_opts, - device_span(static_cast(data_.data()), data_.size()), - rec_starts, - stream); + auto info = create_json_keys_info_table(parse_opts, data, rec_starts, stream); auto aggregated_info = aggregate_keys_info(std::move(info)); auto sorted_info = sort_keys_info_by_offset(std::move(aggregated_info)); @@ -236,9 +233,9 @@ bool should_load_whole_source(json_reader_options const& reader_opts) * Sets the uncomp_data_ and uncomp_size_ data members * Loads the data into device memory if byte range parameters are not used */ -void reader_impl::decompress_input(json_reader_options const& reader_opts, - std::vector const& buffer, - rmm::cuda_stream_view stream) +rmm::device_buffer reader_impl::decompress_input(json_reader_options const& reader_opts, + std::vector const& buffer, + rmm::cuda_stream_view stream) { if (reader_opts.get_compression() == compression_type::NONE) { // Do not use the owner vector here to avoid extra copy @@ -255,12 +252,16 @@ void reader_impl::decompress_input(json_reader_options const& reader_opts, uncomp_size_ = uncomp_data_owner_.size(); } if (should_load_whole_source(reader_opts)) { - data_ = rmm::device_buffer(uncomp_data_, uncomp_size_, stream); + return rmm::device_buffer(uncomp_data_, uncomp_size_, stream); + } else { + return {}; } } rmm::device_uvector reader_impl::find_record_starts( - json_reader_options const& reader_opts, rmm::cuda_stream_view stream) + json_reader_options const& reader_opts, + device_span data, + rmm::cuda_stream_view stream) { std::vector chars_to_count{'\n'}; // Currently, ignoring lineterminations within quotes is handled by recording the records of both, @@ -269,9 +270,10 @@ rmm::device_uvector reader_impl::find_record_starts( // If not starting at an offset, add an extra row to account for the first row in the file cudf::size_type prefilter_count = ((reader_opts.get_byte_range_offset() == 0) ? 1 : 0); if (should_load_whole_source(reader_opts)) { - prefilter_count += count_all_from_set(data_, chars_to_count, stream); + prefilter_count += count_all_from_set(data, chars_to_count, stream); } else { - prefilter_count += count_all_from_set(uncomp_data_, uncomp_size_, chars_to_count, stream); + prefilter_count += + count_all_from_set(host_span(uncomp_data_, uncomp_size_), chars_to_count, stream); } rmm::device_uvector rec_starts(prefilter_count, stream); @@ -286,9 +288,14 @@ rmm::device_uvector reader_impl::find_record_starts( std::vector chars_to_find{'\n'}; // Passing offset = 1 to return positions AFTER the found character if (should_load_whole_source(reader_opts)) { - find_all_from_set(data_, chars_to_find, 1, find_result_ptr, stream); + find_all_from_set(data, chars_to_find, 1, find_result_ptr, stream); } else { - find_all_from_set(uncomp_data_, uncomp_size_, chars_to_find, 1, find_result_ptr, stream); + find_all_from_set( // + host_span(uncomp_data_, uncomp_size_), + chars_to_find, + 1, + find_result_ptr, + stream); } // Previous call stores the record pinput_file.typeositions as encountered by all threads @@ -312,9 +319,9 @@ rmm::device_uvector reader_impl::find_record_starts( * Only rows that need to be parsed are copied, based on the byte range * Also updates the array of record starts to match the device data offset. 
*/ -void reader_impl::upload_data_to_device(json_reader_options const& reader_opts, - rmm::device_uvector& rec_starts, - rmm::cuda_stream_view stream) +rmm::device_buffer reader_impl::upload_data_to_device(json_reader_options const& reader_opts, + rmm::device_uvector& rec_starts, + rmm::cuda_stream_view stream) { size_t start_offset = 0; size_t end_offset = uncomp_size_; @@ -349,15 +356,16 @@ void reader_impl::upload_data_to_device(json_reader_options const& reader_opts, "Error finding the record within the specified byte range.\n"); // Upload the raw data that is within the rows of interest - data_ = rmm::device_buffer(uncomp_data_ + start_offset, bytes_to_upload, stream); + return rmm::device_buffer(uncomp_data_ + start_offset, bytes_to_upload, stream); } std::vector reader_impl::get_column_names(parse_options_view const& parse_opts, device_span rec_starts, + device_span data, rmm::cuda_stream_view stream) { // If file only contains one row, use the file size for the row size - uint64_t first_row_len = data_.size() / sizeof(char); + uint64_t first_row_len = data.size() / sizeof(char); if (rec_starts.size() > 1) { // Set first_row_len to the offset of the second row, if it exists CUDA_TRY(cudaMemcpyAsync(&first_row_len, @@ -368,7 +376,7 @@ std::vector reader_impl::get_column_names(parse_options_view const& } std::vector first_row(first_row_len); CUDA_TRY(cudaMemcpyAsync(first_row.data(), - data_.data(), + data.data(), first_row_len * sizeof(char), cudaMemcpyDeviceToHost, stream.value())); @@ -385,7 +393,7 @@ std::vector reader_impl::get_column_names(parse_options_view const& // If the first opening bracket is '{', assume object format if (first_curly_bracket < first_square_bracket) { // use keys as column names if input rows are objects - auto keys_desc = get_json_object_keys_hashes(parse_opts, rec_starts, stream); + auto keys_desc = get_json_object_keys_hashes(parse_opts, rec_starts, data, stream); set_column_map(std::move(keys_desc.second), stream); return keys_desc.first; } else { @@ -453,6 +461,7 @@ std::vector reader_impl::get_data_types(json_reader_options const& re parse_options_view const& parse_opts, std::vector const& column_names, device_span rec_starts, + device_span data, rmm::cuda_stream_view stream) { bool has_to_infer_column_types = @@ -483,14 +492,13 @@ std::vector reader_impl::get_data_types(json_reader_options const& re auto const num_columns = column_names.size(); auto const do_set_null_count = key_to_col_idx_map_ != nullptr; - auto const h_column_infos = cudf::io::json::gpu::detect_data_types( - parse_opts, - device_span(static_cast(data_.data()), data_.size()), - rec_starts, - do_set_null_count, - num_columns, - get_column_map_device_ptr(), - stream); + auto const h_column_infos = cudf::io::json::gpu::detect_data_types(parse_opts, + data, + rec_starts, + do_set_null_count, + num_columns, + get_column_map_device_ptr(), + stream); auto get_type_id = [&](auto const& cinfo) { auto int_count_total = @@ -532,6 +540,7 @@ table_with_metadata reader_impl::convert_data_to_table(parse_options_view const& std::vector const& dtypes, std::vector const& column_names, device_span rec_starts, + device_span data, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -560,16 +569,15 @@ table_with_metadata reader_impl::convert_data_to_table(parse_options_view const& auto d_valid_counts = cudf::detail::make_zeroed_device_uvector_async(num_columns, stream); - cudf::io::json::gpu::convert_json_to_columns( - parse_opts, - device_span(static_cast(data_.data()), 
data_.size()), - rec_starts, - d_dtypes, - get_column_map_device_ptr(), - d_data, - d_valid, - d_valid_counts, - stream); + cudf::io::json::gpu::convert_json_to_columns(parse_opts, + data, + rec_starts, + d_dtypes, + get_column_map_device_ptr(), + d_data, + d_valid, + d_valid_counts, + stream); stream.synchronize(); @@ -649,26 +657,32 @@ table_with_metadata reader_impl::read(std::vector>& CUDF_EXPECTS(buffer.size() != 0, "Ingest failed: input data is null.\n"); - decompress_input(reader_opts, buffer, stream); + auto data = decompress_input(reader_opts, buffer, stream); + auto data_span = device_span(static_cast(data.data()), data.size()); CUDF_EXPECTS(uncomp_data_ != nullptr, "Ingest failed: uncompressed input data is null.\n"); CUDF_EXPECTS(uncomp_size_ != 0, "Ingest failed: uncompressed input data has zero size.\n"); - auto rec_starts = find_record_starts(reader_opts, stream); + auto rec_starts = find_record_starts(reader_opts, data_span, stream); + CUDF_EXPECTS(!rec_starts.is_empty(), "Error enumerating records.\n"); - upload_data_to_device(reader_opts, rec_starts, stream); - CUDF_EXPECTS(data_.size() != 0, "Error uploading input data to the GPU.\n"); + data = upload_data_to_device(reader_opts, rec_starts, stream); + data_span = device_span(static_cast(data.data()), data.size()); + + CUDF_EXPECTS(data_span.size() != 0, "Error uploading input data to the GPU.\n"); - auto column_names = get_column_names(parse_opts.view(), rec_starts, stream); + auto column_names = get_column_names(parse_opts.view(), rec_starts, data_span, stream); CUDF_EXPECTS(not column_names.empty(), "Error determining column names.\n"); - auto dtypes = get_data_types(reader_opts, parse_opts.view(), column_names, rec_starts, stream); + auto dtypes = + get_data_types(reader_opts, parse_opts.view(), column_names, rec_starts, data_span, stream); CUDF_EXPECTS(not dtypes.empty(), "Error in data type detection.\n"); - return convert_data_to_table(parse_opts.view(), dtypes, column_names, rec_starts, stream, mr); + return convert_data_to_table( + parse_opts.view(), dtypes, column_names, rec_starts, data_span, stream, mr); } table_with_metadata read_json(std::vector>& sources, diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 75031ebb68a..fa464a0ef8b 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -57,7 +57,6 @@ class reader_impl { // Used when the input data is compressed, to ensure the allocated uncompressed data is freed std::vector uncomp_data_owner_; - rmm::device_buffer data_; // the map is only used for files with rows in object format; initialize to a dummy value so the // map object can be passed to the kernel in any case @@ -108,6 +107,7 @@ class reader_impl { std::pair, col_map_ptr_type> get_json_object_keys_hashes( parse_options_view const& parse_opts, device_span rec_starts, + device_span data, rmm::cuda_stream_view stream); /** @@ -115,9 +115,9 @@ class reader_impl { * * Sets the uncomp_data_ and uncomp_size_ data members */ - void decompress_input(json_reader_options const& options, - std::vector const& buffer, - rmm::cuda_stream_view stream); + rmm::device_buffer decompress_input(json_reader_options const& options, + std::vector const& buffer, + rmm::cuda_stream_view stream); /** * @brief Finds all record starts in the file. 
@@ -128,6 +128,7 @@ class reader_impl { * @return Record starts in the device memory */ rmm::device_uvector find_record_starts(json_reader_options const& reader_opts, + device_span data, rmm::cuda_stream_view stream); /** @@ -137,9 +138,9 @@ class reader_impl { * Only rows that need to be parsed are copied, based on the byte range * Also updates the array of record starts to match the device data offset. */ - void upload_data_to_device(json_reader_options const& reader_opts, - rmm::device_uvector& rec_starts, - rmm::cuda_stream_view stream); + rmm::device_buffer upload_data_to_device(json_reader_options const& reader_opts, + rmm::device_uvector& rec_starts, + rmm::cuda_stream_view stream); /** * @brief Parse the first row to set the column name @@ -151,6 +152,7 @@ class reader_impl { */ std::vector get_column_names(parse_options_view const& parse_opts, device_span rec_starts, + device_span data, rmm::cuda_stream_view stream); std::vector parse_data_types(std::vector const& column_names, @@ -169,6 +171,7 @@ class reader_impl { parse_options_view const& parse_opts, std::vector const& column_names, device_span rec_starts, + device_span data, rmm::cuda_stream_view stream); /** @@ -184,6 +187,7 @@ class reader_impl { std::vector const& dtypes, std::vector const& column_names, device_span rec_starts, + device_span data, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); diff --git a/cpp/src/io/utilities/parsing_utils.cu b/cpp/src/io/utilities/parsing_utils.cu index ba62238c5d3..2edf2d7505e 100644 --- a/cpp/src/io/utilities/parsing_utils.cu +++ b/cpp/src/io/utilities/parsing_utils.cu @@ -100,8 +100,8 @@ __global__ void count_and_set_positions(const char* data, } // namespace template -cudf::size_type find_all_from_set(const rmm::device_buffer& d_data, - const std::vector& keys, +cudf::size_type find_all_from_set(device_span data, + std::vector const& keys, uint64_t result_offset, T* positions, rmm::cuda_stream_view stream) @@ -110,31 +110,25 @@ cudf::size_type find_all_from_set(const rmm::device_buffer& d_data, int min_grid_size = 0; // minimum block count required CUDA_TRY( cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, count_and_set_positions)); - const int grid_size = divCeil(d_data.size(), (size_t)block_size); + const int grid_size = divCeil(data.size(), (size_t)block_size); auto d_count = cudf::detail::make_zeroed_device_uvector_async(1, stream); for (char key : keys) { - count_and_set_positions - <<>>(static_cast(d_data.data()), - d_data.size(), - result_offset, - key, - d_count.data(), - positions); + count_and_set_positions<<>>( + data.data(), data.size(), result_offset, key, d_count.data(), positions); } return cudf::detail::make_std_vector_sync(d_count, stream)[0]; } template -cudf::size_type find_all_from_set(const char* h_data, - size_t h_size, +cudf::size_type find_all_from_set(host_span data, const std::vector& keys, uint64_t result_offset, T* positions, rmm::cuda_stream_view stream) { - rmm::device_buffer d_chunk(std::min(max_chunk_bytes, h_size), stream); + rmm::device_buffer d_chunk(std::min(max_chunk_bytes, data.size()), stream); auto d_count = cudf::detail::make_zeroed_device_uvector_async(1, stream); int block_size = 0; // suggested thread count to use @@ -142,13 +136,13 @@ cudf::size_type find_all_from_set(const char* h_data, CUDA_TRY( cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, count_and_set_positions)); - const size_t chunk_count = divCeil(h_size, max_chunk_bytes); + const size_t chunk_count = divCeil(data.size(), 
max_chunk_bytes); for (size_t ci = 0; ci < chunk_count; ++ci) { const auto chunk_offset = ci * max_chunk_bytes; - const auto h_chunk = h_data + chunk_offset; - const int chunk_bytes = std::min((size_t)(h_size - ci * max_chunk_bytes), max_chunk_bytes); - const auto chunk_bits = divCeil(chunk_bytes, bytes_per_find_thread); - const int grid_size = divCeil(chunk_bits, block_size); + const auto h_chunk = data.data() + chunk_offset; + const int chunk_bytes = std::min((size_t)(data.size() - ci * max_chunk_bytes), max_chunk_bytes); + const auto chunk_bits = divCeil(chunk_bytes, bytes_per_find_thread); + const int grid_size = divCeil(chunk_bits, block_size); // Copy chunk to device CUDA_TRY( @@ -168,45 +162,42 @@ cudf::size_type find_all_from_set(const char* h_data, return cudf::detail::make_std_vector_sync(d_count, stream)[0]; } -template cudf::size_type find_all_from_set(const rmm::device_buffer& d_data, - const std::vector& keys, +template cudf::size_type find_all_from_set(device_span data, + std::vector const& keys, uint64_t result_offset, uint64_t* positions, rmm::cuda_stream_view stream); -template cudf::size_type find_all_from_set(const rmm::device_buffer& d_data, - const std::vector& keys, +template cudf::size_type find_all_from_set(device_span data, + std::vector const& keys, uint64_t result_offset, pos_key_pair* positions, rmm::cuda_stream_view stream); -template cudf::size_type find_all_from_set(const char* h_data, - size_t h_size, - const std::vector& keys, +template cudf::size_type find_all_from_set(host_span data, + std::vector const& keys, uint64_t result_offset, uint64_t* positions, rmm::cuda_stream_view stream); -template cudf::size_type find_all_from_set(const char* h_data, - size_t h_size, - const std::vector& keys, +template cudf::size_type find_all_from_set(host_span data, + std::vector const& keys, uint64_t result_offset, pos_key_pair* positions, rmm::cuda_stream_view stream); -cudf::size_type count_all_from_set(const rmm::device_buffer& d_data, - const std::vector& keys, +cudf::size_type count_all_from_set(device_span data, + std::vector const& keys, rmm::cuda_stream_view stream) { - return find_all_from_set(d_data, keys, 0, nullptr, stream); + return find_all_from_set(data, keys, 0, nullptr, stream); } -cudf::size_type count_all_from_set(const char* h_data, - size_t h_size, +cudf::size_type count_all_from_set(host_span data, const std::vector& keys, rmm::cuda_stream_view stream) { - return find_all_from_set(h_data, h_size, keys, 0, nullptr, stream); + return find_all_from_set(data, keys, 0, nullptr, stream); } } // namespace io diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index daf23de7eb2..73369e75f59 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -390,8 +390,8 @@ __device__ __inline__ cudf::size_type* infer_integral_field_counter(char const* * @return cudf::size_type total number of occurrences */ template -cudf::size_type find_all_from_set(const rmm::device_buffer& d_data, - const std::vector& keys, +cudf::size_type find_all_from_set(device_span data, + std::vector const& keys, uint64_t result_offset, T* positions, rmm::cuda_stream_view stream); @@ -414,8 +414,7 @@ cudf::size_type find_all_from_set(const rmm::device_buffer& d_data, * @return cudf::size_type total number of occurrences */ template -cudf::size_type find_all_from_set(const char* h_data, - size_t h_size, +cudf::size_type find_all_from_set(host_span data, const std::vector& keys, uint64_t result_offset, T* 
positions, @@ -431,8 +430,8 @@ cudf::size_type find_all_from_set(const char* h_data, * * @return cudf::size_type total number of occurrences */ -cudf::size_type count_all_from_set(const rmm::device_buffer& d_data, - const std::vector& keys, +cudf::size_type count_all_from_set(device_span data, + std::vector const& keys, rmm::cuda_stream_view stream); /** @@ -449,8 +448,7 @@ cudf::size_type count_all_from_set(const rmm::device_buffer& d_data, * * @return cudf::size_type total number of occurrences */ -cudf::size_type count_all_from_set(const char* h_data, - size_t h_size, +cudf::size_type count_all_from_set(host_span data, const std::vector& keys, rmm::cuda_stream_view stream); From f84dcc3d12f520af6bec92950b4f494ef181a2c6 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 20 Aug 2021 21:09:03 -0500 Subject: [PATCH 18/32] replace json::reader_impl column_map members with local variables --- cpp/src/hash/concurrent_unordered_map.cuh | 7 ++- cpp/src/io/json/json_gpu.cu | 26 ++++++----- cpp/src/io/json/reader_impl.cu | 53 ++++++++++------------- cpp/src/io/json/reader_impl.hpp | 36 +++------------ 4 files changed, 51 insertions(+), 71 deletions(-) diff --git a/cpp/src/hash/concurrent_unordered_map.cuh b/cpp/src/hash/concurrent_unordered_map.cuh index c4a9da9285d..a3f954920c8 100644 --- a/cpp/src/hash/concurrent_unordered_map.cuh +++ b/cpp/src/hash/concurrent_unordered_map.cuh @@ -538,8 +538,11 @@ class concurrent_unordered_map { } } - init_hashtbl<<<((m_capacity - 1) / block_size) + 1, block_size, 0, stream.value()>>>( - m_hashtbl_values, m_capacity, m_unused_key, m_unused_element); + if (m_capacity > 0) { + init_hashtbl<<<((m_capacity - 1) / block_size) + 1, block_size, 0, stream.value()>>>( + m_hashtbl_values, m_capacity, m_unused_key, m_unused_element); + } + CUDA_TRY(cudaGetLastError()); } }; diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index ba6bc30e0d4..d3930daefd2 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -424,19 +424,19 @@ __device__ field_descriptor next_field_descriptor(const char* begin, const char* end, parse_options_view const& opts, cudf::size_type field_idx, - col_map_type* col_map) + col_map_type col_map) { auto const desc_pre_trim = - col_map == nullptr + col_map.capacity() == 0 // No key - column and begin are trivial ? field_descriptor{field_idx, begin, cudf::io::gpu::seek_field_end(begin, end, opts, true)} : [&]() { auto const key_range = get_next_key(begin, end, opts.quotechar); auto const key_hash = MurmurHash3_32{}( cudf::string_view(key_range.first, key_range.second - key_range.first)); - auto const hash_col = col_map->find(key_hash); + auto const hash_col = col_map.find(key_hash); // Fall back to field index if not found (parsing error) - auto const column = (hash_col != col_map->end()) ? (*hash_col).second : field_idx; + auto const column = (hash_col != col_map.end()) ? 
(*hash_col).second : field_idx; // Skip the colon between the key and the value auto const value_begin = thrust::find(thrust::seq, key_range.second, end, ':') + 1; @@ -491,7 +491,7 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts, device_span const data, device_span const row_offsets, device_span const column_types, - col_map_type* col_map, + col_map_type col_map, device_span const output_columns, device_span const valid_fields, device_span const num_valid_fields) @@ -562,14 +562,14 @@ __global__ void detect_data_types_kernel( parse_options_view const opts, device_span const data, device_span const row_offsets, - col_map_type* col_map, + col_map_type col_map, int num_columns, device_span const column_infos) { auto const rec_id = threadIdx.x + (blockDim.x * blockIdx.x); if (rec_id >= row_offsets.size()) return; - auto const are_rows_objects = col_map != nullptr; + auto const are_rows_objects = col_map.capacity() != 0; auto const row_data_range = get_row_data_range(data, row_offsets, rec_id); size_type input_field_index = 0; @@ -768,8 +768,14 @@ void convert_json_to_columns(parse_options_view const& opts, const int grid_size = (row_offsets.size() + block_size - 1) / block_size; - convert_data_to_columns_kernel<<>>( - opts, data, row_offsets, column_types, col_map, output_columns, valid_fields, num_valid_fields); + convert_data_to_columns_kernel<<>>(opts, + data, + row_offsets, + column_types, + *col_map, + output_columns, + valid_fields, + num_valid_fields); CUDA_TRY(cudaGetLastError()); } @@ -814,7 +820,7 @@ std::vector detect_data_types( const int grid_size = (row_offsets.size() + block_size - 1) / block_size; detect_data_types_kernel<<>>( - options, data, row_offsets, col_map, num_columns, d_column_infos); + options, data, row_offsets, *col_map, num_columns, d_column_infos); return cudf::detail::make_std_vector_sync(d_column_infos, stream); } diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 7166cb776c8..4a1b0dc5afc 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -88,7 +88,7 @@ std::unique_ptr
<table> aggregate_keys_info(std::unique_ptr<table>
info) col_map_ptr_type create_col_names_hash_map(column_view column_name_hashes, rmm::cuda_stream_view stream) { - auto key_col_map{col_map_type::create(column_name_hashes.size(), stream)}; + auto key_col_map = col_map_type::create(column_name_hashes.size(), stream); auto const column_data = column_name_hashes.data(); thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -359,10 +359,11 @@ rmm::device_buffer reader_impl::upload_data_to_device(json_reader_options const& return rmm::device_buffer(uncomp_data_ + start_offset, bytes_to_upload, stream); } -std::vector reader_impl::get_column_names(parse_options_view const& parse_opts, - device_span rec_starts, - device_span data, - rmm::cuda_stream_view stream) +std::pair, col_map_ptr_type> reader_impl::get_column_names_and_map( + parse_options_view const& parse_opts, + device_span rec_starts, + device_span data, + rmm::cuda_stream_view stream) { // If file only contains one row, use the file size for the row size uint64_t first_row_len = data.size() / sizeof(char); @@ -393,9 +394,7 @@ std::vector reader_impl::get_column_names(parse_options_view const& // If the first opening bracket is '{', assume object format if (first_curly_bracket < first_square_bracket) { // use keys as column names if input rows are objects - auto keys_desc = get_json_object_keys_hashes(parse_opts, rec_starts, data, stream); - set_column_map(std::move(keys_desc.second), stream); - return keys_desc.first; + return get_json_object_keys_hashes(parse_opts, rec_starts, data, stream); } else { int cols_found = 0; bool quotation = false; @@ -411,7 +410,7 @@ std::vector reader_impl::get_column_names(parse_options_view const& column_names.emplace_back(std::to_string(cols_found++)); } } - return column_names; + return {column_names, col_map_type::create(0, stream)}; } } @@ -460,6 +459,7 @@ std::vector reader_impl::parse_data_types( std::vector reader_impl::get_data_types(json_reader_options const& reader_opts, parse_options_view const& parse_opts, std::vector const& column_names, + col_map_type* column_map, device_span rec_starts, device_span data, rmm::cuda_stream_view stream) @@ -490,15 +490,10 @@ std::vector reader_impl::get_data_types(json_reader_options const& re } else { CUDF_EXPECTS(rec_starts.size() != 0, "No data available for data type inference.\n"); auto const num_columns = column_names.size(); - auto const do_set_null_count = key_to_col_idx_map_ != nullptr; + auto const do_set_null_count = column_map->capacity() > 0; - auto const h_column_infos = cudf::io::json::gpu::detect_data_types(parse_opts, - data, - rec_starts, - do_set_null_count, - num_columns, - get_column_map_device_ptr(), - stream); + auto const h_column_infos = cudf::io::json::gpu::detect_data_types( + parse_opts, data, rec_starts, do_set_null_count, num_columns, column_map, stream); auto get_type_id = [&](auto const& cinfo) { auto int_count_total = @@ -539,6 +534,7 @@ std::vector reader_impl::get_data_types(json_reader_options const& re table_with_metadata reader_impl::convert_data_to_table(parse_options_view const& parse_opts, std::vector const& dtypes, std::vector const& column_names, + col_map_type* column_map, device_span rec_starts, device_span data, rmm::cuda_stream_view stream, @@ -569,15 +565,8 @@ table_with_metadata reader_impl::convert_data_to_table(parse_options_view const& auto d_valid_counts = cudf::detail::make_zeroed_device_uvector_async(num_columns, stream); - cudf::io::json::gpu::convert_json_to_columns(parse_opts, - data, - rec_starts, - d_dtypes, - 
get_column_map_device_ptr(), - d_data, - d_valid, - d_valid_counts, - stream); + cudf::io::json::gpu::convert_json_to_columns( + parse_opts, data, rec_starts, d_dtypes, column_map, d_data, d_valid, d_valid_counts, stream); stream.synchronize(); @@ -672,17 +661,21 @@ table_with_metadata reader_impl::read(std::vector>& CUDF_EXPECTS(data_span.size() != 0, "Error uploading input data to the GPU.\n"); - auto column_names = get_column_names(parse_opts.view(), rec_starts, data_span, stream); + auto column_names_and_map = + get_column_names_and_map(parse_opts.view(), rec_starts, data_span, stream); + + auto column_names = std::get<0>(column_names_and_map); + auto column_map = std::move(std::get<1>(column_names_and_map)); CUDF_EXPECTS(not column_names.empty(), "Error determining column names.\n"); - auto dtypes = - get_data_types(reader_opts, parse_opts.view(), column_names, rec_starts, data_span, stream); + auto dtypes = get_data_types( + reader_opts, parse_opts.view(), column_names, column_map.get(), rec_starts, data_span, stream); CUDF_EXPECTS(not dtypes.empty(), "Error in data type detection.\n"); return convert_data_to_table( - parse_opts.view(), dtypes, column_names, rec_starts, data_span, stream, mr); + parse_opts.view(), dtypes, column_names, column_map.get(), rec_starts, data_span, stream, mr); } table_with_metadata read_json(std::vector>& sources, diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index fa464a0ef8b..bf421f1604d 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -58,31 +58,6 @@ class reader_impl { // Used when the input data is compressed, to ensure the allocated uncompressed data is freed std::vector uncomp_data_owner_; - // the map is only used for files with rows in object format; initialize to a dummy value so the - // map object can be passed to the kernel in any case - col_map_ptr_type key_to_col_idx_map_; - std::unique_ptr> d_key_col_map_; - - /** - * @brief Sets the column map data member and makes a device copy to be used as a kernel - * parameter. - */ - void set_column_map(col_map_ptr_type&& map, rmm::cuda_stream_view stream) - { - key_to_col_idx_map_ = std::move(map); - d_key_col_map_ = - std::make_unique>(*key_to_col_idx_map_, stream); - } - /** - * @brief Gets the pointer to the column hash map in the device memory. - * - * Returns `nullptr` if the map is not created. - */ - auto get_column_map_device_ptr() - { - return key_to_col_idx_map_ ? d_key_col_map_->data() : nullptr; - } - /** * @brief Ingest input JSON file/buffer, without decompression * @@ -150,10 +125,11 @@ class reader_impl { * @param[in] rec_starts Record starts in device memory * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
*/ - std::vector get_column_names(parse_options_view const& parse_opts, - device_span rec_starts, - device_span data, - rmm::cuda_stream_view stream); + std::pair, col_map_ptr_type> get_column_names_and_map( + parse_options_view const& parse_opts, + device_span rec_starts, + device_span data, + rmm::cuda_stream_view stream); std::vector parse_data_types(std::vector const& column_names, std::vector const& types_as_strings); @@ -170,6 +146,7 @@ class reader_impl { std::vector get_data_types(json_reader_options const& reader_opts, parse_options_view const& parse_opts, std::vector const& column_names, + col_map_type* column_map, device_span rec_starts, device_span data, rmm::cuda_stream_view stream); @@ -186,6 +163,7 @@ class reader_impl { table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, std::vector const& dtypes, std::vector const& column_names, + col_map_type* column_map, device_span rec_starts, device_span data, rmm::cuda_stream_view stream, From e11e9dbe3ae694314548f008bd76eac0498a319c Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 20 Aug 2021 21:28:33 -0500 Subject: [PATCH 19/32] change json::reader_impl host buffer type from uint8_t to char --- cpp/src/io/json/reader_impl.cu | 15 ++++++++------- cpp/src/io/json/reader_impl.hpp | 4 ++-- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 4a1b0dc5afc..dd5305ebfb8 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -199,7 +199,7 @@ std::pair, col_map_ptr_type> reader_impl::get_json_obje } void reader_impl::ingest_raw_input(std::vector> const& sources, - std::vector& buffer, + std::vector& buffer, size_t range_offset, size_t range_size, size_t range_size_padded) @@ -215,8 +215,9 @@ void reader_impl::ingest_raw_input(std::vector> cons size_t bytes_read = 0; for (const auto& source : sources) { if (!source->is_empty()) { - auto data_size = (range_size_padded != 0) ? range_size_padded : source->size(); - bytes_read += source->host_read(range_offset, data_size, &buffer[bytes_read]); + auto data_size = (range_size_padded != 0) ? 
range_size_padded : source->size(); + auto destination = reinterpret_cast(buffer.data()) + bytes_read; + bytes_read += source->host_read(range_offset, data_size, destination); } } } @@ -234,17 +235,17 @@ bool should_load_whole_source(json_reader_options const& reader_opts) * Loads the data into device memory if byte range parameters are not used */ rmm::device_buffer reader_impl::decompress_input(json_reader_options const& reader_opts, - std::vector const& buffer, + std::vector const& buffer, rmm::cuda_stream_view stream) { if (reader_opts.get_compression() == compression_type::NONE) { // Do not use the owner vector here to avoid extra copy - uncomp_data_ = reinterpret_cast(buffer.data()); + uncomp_data_ = buffer.data(); uncomp_size_ = buffer.size(); } else { uncomp_data_owner_ = get_uncompressed_data( // host_span( // - reinterpret_cast(buffer.data()), + buffer.data(), buffer.size()), reader_opts.get_compression()); @@ -640,7 +641,7 @@ table_with_metadata reader_impl::read(std::vector>& auto range_size = reader_opts.get_byte_range_size(); auto range_size_padded = reader_opts.get_byte_range_size_with_padding(); - std::vector buffer; + std::vector buffer; ingest_raw_input(sources, buffer, range_offset, range_size, range_size_padded); diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index bf421f1604d..0e6a8005055 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -69,7 +69,7 @@ class reader_impl { * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data */ void ingest_raw_input(std::vector> const& sources, - std::vector& buffer, + std::vector& buffer, size_t range_offset, size_t range_size, size_t range_size_padded); @@ -91,7 +91,7 @@ class reader_impl { * Sets the uncomp_data_ and uncomp_size_ data members */ rmm::device_buffer decompress_input(json_reader_options const& options, - std::vector const& buffer, + std::vector const& buffer, rmm::cuda_stream_view stream); /** From df102172095e36958023412bf97729bef04c2f9d Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 20 Aug 2021 21:42:40 -0500 Subject: [PATCH 20/32] replace json::reader_impl uncomp data members with single span member --- cpp/src/io/json/reader_impl.cu | 60 +++++++++++++++------------------ cpp/src/io/json/reader_impl.hpp | 15 ++++----- 2 files changed, 35 insertions(+), 40 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index dd5305ebfb8..9e01e9f8405 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -194,7 +194,7 @@ std::pair, col_map_ptr_type> reader_impl::get_json_obje auto aggregated_info = aggregate_keys_info(std::move(info)); auto sorted_info = sort_keys_info_by_offset(std::move(aggregated_info)); - return {create_key_strings(uncomp_data_, sorted_info->view(), stream), + return {create_key_strings(uncomp_data_.data(), sorted_info->view(), stream), create_col_names_hash_map(sorted_info->get_column(2).view(), stream)}; } @@ -234,14 +234,13 @@ bool should_load_whole_source(json_reader_options const& reader_opts) * Sets the uncomp_data_ and uncomp_size_ data members * Loads the data into device memory if byte range parameters are not used */ -rmm::device_buffer reader_impl::decompress_input(json_reader_options const& reader_opts, - std::vector const& buffer, - rmm::cuda_stream_view stream) +rmm::device_uvector reader_impl::decompress_input(json_reader_options const& reader_opts, + std::vector const& buffer, + rmm::cuda_stream_view stream) { 
if (reader_opts.get_compression() == compression_type::NONE) { // Do not use the owner vector here to avoid extra copy - uncomp_data_ = buffer.data(); - uncomp_size_ = buffer.size(); + uncomp_data_ = host_span(static_cast(buffer.data()), buffer.size()); } else { uncomp_data_owner_ = get_uncompressed_data( // host_span( // @@ -249,13 +248,12 @@ rmm::device_buffer reader_impl::decompress_input(json_reader_options const& read buffer.size()), reader_opts.get_compression()); - uncomp_data_ = uncomp_data_owner_.data(); - uncomp_size_ = uncomp_data_owner_.size(); + uncomp_data_ = host_span(uncomp_data_owner_.data(), uncomp_data_owner_.size()); } if (should_load_whole_source(reader_opts)) { - return rmm::device_buffer(uncomp_data_, uncomp_size_, stream); + return cudf::detail::make_device_uvector_async(uncomp_data_, stream); } else { - return {}; + return rmm::device_uvector(0, stream); } } @@ -273,8 +271,7 @@ rmm::device_uvector reader_impl::find_record_starts( if (should_load_whole_source(reader_opts)) { prefilter_count += count_all_from_set(data, chars_to_count, stream); } else { - prefilter_count += - count_all_from_set(host_span(uncomp_data_, uncomp_size_), chars_to_count, stream); + prefilter_count += count_all_from_set(uncomp_data_, chars_to_count, stream); } rmm::device_uvector rec_starts(prefilter_count, stream); @@ -292,7 +289,7 @@ rmm::device_uvector reader_impl::find_record_starts( find_all_from_set(data, chars_to_find, 1, find_result_ptr, stream); } else { find_all_from_set( // - host_span(uncomp_data_, uncomp_size_), + uncomp_data_, chars_to_find, 1, find_result_ptr, @@ -307,7 +304,7 @@ rmm::device_uvector reader_impl::find_record_starts( auto filtered_count = prefilter_count; // Exclude the ending newline as it does not precede a record start - if (uncomp_data_[uncomp_size_ - 1] == '\n') { filtered_count--; } + if (uncomp_data_.back() == '\n') { filtered_count--; } rec_starts.resize(filtered_count, stream); return rec_starts; @@ -320,12 +317,13 @@ rmm::device_uvector reader_impl::find_record_starts( * Only rows that need to be parsed are copied, based on the byte range * Also updates the array of record starts to match the device data offset. 
*/ -rmm::device_buffer reader_impl::upload_data_to_device(json_reader_options const& reader_opts, - rmm::device_uvector& rec_starts, - rmm::cuda_stream_view stream) +rmm::device_uvector reader_impl::upload_data_to_device( + json_reader_options const& reader_opts, + rmm::device_uvector& rec_starts, + rmm::cuda_stream_view stream) { size_t start_offset = 0; - size_t end_offset = uncomp_size_; + size_t end_offset = uncomp_data_.size(); // Trim lines that are outside range if (reader_opts.get_byte_range_size() != 0 || reader_opts.get_byte_range_offset() != 0) { @@ -353,11 +351,12 @@ rmm::device_buffer reader_impl::upload_data_to_device(json_reader_options const& } const size_t bytes_to_upload = end_offset - start_offset; - CUDF_EXPECTS(bytes_to_upload <= uncomp_size_, + CUDF_EXPECTS(bytes_to_upload <= uncomp_data_.size(), "Error finding the record within the specified byte range.\n"); // Upload the raw data that is within the rows of interest - return rmm::device_buffer(uncomp_data_ + start_offset, bytes_to_upload, stream); + return cudf::detail::make_device_uvector_async( + uncomp_data_.subspan(start_offset, bytes_to_upload), stream); } std::pair, col_map_ptr_type> reader_impl::get_column_names_and_map( @@ -647,23 +646,20 @@ table_with_metadata reader_impl::read(std::vector>& CUDF_EXPECTS(buffer.size() != 0, "Ingest failed: input data is null.\n"); - auto data = decompress_input(reader_opts, buffer, stream); - auto data_span = device_span(static_cast(data.data()), data.size()); + auto data = decompress_input(reader_opts, buffer, stream); - CUDF_EXPECTS(uncomp_data_ != nullptr, "Ingest failed: uncompressed input data is null.\n"); - CUDF_EXPECTS(uncomp_size_ != 0, "Ingest failed: uncompressed input data has zero size.\n"); + CUDF_EXPECTS(uncomp_data_.data() != nullptr, "Ingest failed: uncompressed input data is null.\n"); + CUDF_EXPECTS(uncomp_data_.size() != 0, "Ingest failed: uncompressed input data has zero size.\n"); - auto rec_starts = find_record_starts(reader_opts, data_span, stream); + auto rec_starts = find_record_starts(reader_opts, data, stream); CUDF_EXPECTS(!rec_starts.is_empty(), "Error enumerating records.\n"); - data = upload_data_to_device(reader_opts, rec_starts, stream); - data_span = device_span(static_cast(data.data()), data.size()); + data = upload_data_to_device(reader_opts, rec_starts, stream); - CUDF_EXPECTS(data_span.size() != 0, "Error uploading input data to the GPU.\n"); + CUDF_EXPECTS(data.size() != 0, "Error uploading input data to the GPU.\n"); - auto column_names_and_map = - get_column_names_and_map(parse_opts.view(), rec_starts, data_span, stream); + auto column_names_and_map = get_column_names_and_map(parse_opts.view(), rec_starts, data, stream); auto column_names = std::get<0>(column_names_and_map); auto column_map = std::move(std::get<1>(column_names_and_map)); @@ -671,12 +667,12 @@ table_with_metadata reader_impl::read(std::vector>& CUDF_EXPECTS(not column_names.empty(), "Error determining column names.\n"); auto dtypes = get_data_types( - reader_opts, parse_opts.view(), column_names, column_map.get(), rec_starts, data_span, stream); + reader_opts, parse_opts.view(), column_names, column_map.get(), rec_starts, data, stream); CUDF_EXPECTS(not dtypes.empty(), "Error in data type detection.\n"); return convert_data_to_table( - parse_opts.view(), dtypes, column_names, column_map.get(), rec_starts, data_span, stream, mr); + parse_opts.view(), dtypes, column_names, column_map.get(), rec_starts, data, stream, mr); } table_with_metadata read_json(std::vector>& 
sources, diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 0e6a8005055..79d70b2e121 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -52,8 +52,7 @@ using col_map_ptr_type = std::unique_ptr uncomp_data_; // Used when the input data is compressed, to ensure the allocated uncompressed data is freed std::vector uncomp_data_owner_; @@ -90,9 +89,9 @@ class reader_impl { * * Sets the uncomp_data_ and uncomp_size_ data members */ - rmm::device_buffer decompress_input(json_reader_options const& options, - std::vector const& buffer, - rmm::cuda_stream_view stream); + rmm::device_uvector decompress_input(json_reader_options const& options, + std::vector const& buffer, + rmm::cuda_stream_view stream); /** * @brief Finds all record starts in the file. @@ -113,9 +112,9 @@ class reader_impl { * Only rows that need to be parsed are copied, based on the byte range * Also updates the array of record starts to match the device data offset. */ - rmm::device_buffer upload_data_to_device(json_reader_options const& reader_opts, - rmm::device_uvector& rec_starts, - rmm::cuda_stream_view stream); + rmm::device_uvector upload_data_to_device(json_reader_options const& reader_opts, + rmm::device_uvector& rec_starts, + rmm::cuda_stream_view stream); /** * @brief Parse the first row to set the column name From 0417be815745137ad10605ccdabea33f9c4c481a Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 20 Aug 2021 22:06:19 -0500 Subject: [PATCH 21/32] json::reader_impl simplify device data copy logic --- cpp/src/io/json/reader_impl.cu | 68 ++++++++++++++++----------------- cpp/src/io/json/reader_impl.hpp | 6 +-- 2 files changed, 37 insertions(+), 37 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 9e01e9f8405..94d9a1b6574 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -234,9 +234,9 @@ bool should_load_whole_source(json_reader_options const& reader_opts) * Sets the uncomp_data_ and uncomp_size_ data members * Loads the data into device memory if byte range parameters are not used */ -rmm::device_uvector reader_impl::decompress_input(json_reader_options const& reader_opts, - std::vector const& buffer, - rmm::cuda_stream_view stream) +void reader_impl::decompress_input(json_reader_options const& reader_opts, + std::vector const& buffer, + rmm::cuda_stream_view stream) { if (reader_opts.get_compression() == compression_type::NONE) { // Do not use the owner vector here to avoid extra copy @@ -250,11 +250,6 @@ rmm::device_uvector reader_impl::decompress_input(json_reader_options cons uncomp_data_ = host_span(uncomp_data_owner_.data(), uncomp_data_owner_.size()); } - if (should_load_whole_source(reader_opts)) { - return cudf::detail::make_device_uvector_async(uncomp_data_, stream); - } else { - return rmm::device_uvector(0, stream); - } } rmm::device_uvector reader_impl::find_record_starts( @@ -322,34 +317,31 @@ rmm::device_uvector reader_impl::upload_data_to_device( rmm::device_uvector& rec_starts, rmm::cuda_stream_view stream) { - size_t start_offset = 0; - size_t end_offset = uncomp_data_.size(); + size_t end_offset = uncomp_data_.size(); // Trim lines that are outside range - if (reader_opts.get_byte_range_size() != 0 || reader_opts.get_byte_range_offset() != 0) { - auto h_rec_starts = cudf::detail::make_std_vector_sync(rec_starts, stream); - - if (reader_opts.get_byte_range_size() != 0) { - auto it = h_rec_starts.end() - 1; - while (it >= h_rec_starts.begin() 
&& *it > reader_opts.get_byte_range_size()) { - end_offset = *it; - --it; - } - h_rec_starts.erase(it + 1, h_rec_starts.end()); - } + auto h_rec_starts = cudf::detail::make_std_vector_sync(rec_starts, stream); - // Resize to exclude rows outside of the range - // Adjust row start positions to account for the data subcopy - start_offset = h_rec_starts.front(); - rec_starts.resize(h_rec_starts.size(), stream); - thrust::transform(rmm::exec_policy(stream), - rec_starts.begin(), - rec_starts.end(), - thrust::make_constant_iterator(start_offset), - rec_starts.begin(), - thrust::minus()); + if (reader_opts.get_byte_range_size() != 0) { + auto it = h_rec_starts.end() - 1; + while (it >= h_rec_starts.begin() && *it > reader_opts.get_byte_range_size()) { + end_offset = *it; + --it; + } + h_rec_starts.erase(it + 1, h_rec_starts.end()); } + // Resize to exclude rows outside of the range + // Adjust row start positions to account for the data subcopy + size_t start_offset = h_rec_starts.front(); + rec_starts.resize(h_rec_starts.size(), stream); + thrust::transform(rmm::exec_policy(stream), + rec_starts.begin(), + rec_starts.end(), + thrust::make_constant_iterator(start_offset), + rec_starts.begin(), + thrust::minus()); + const size_t bytes_to_upload = end_offset - start_offset; CUDF_EXPECTS(bytes_to_upload <= uncomp_data_.size(), "Error finding the record within the specified byte range.\n"); @@ -646,16 +638,24 @@ table_with_metadata reader_impl::read(std::vector>& CUDF_EXPECTS(buffer.size() != 0, "Ingest failed: input data is null.\n"); - auto data = decompress_input(reader_opts, buffer, stream); + decompress_input(reader_opts, buffer, stream); CUDF_EXPECTS(uncomp_data_.data() != nullptr, "Ingest failed: uncompressed input data is null.\n"); CUDF_EXPECTS(uncomp_data_.size() != 0, "Ingest failed: uncompressed input data has zero size.\n"); + auto data = rmm::device_uvector(0, stream); + + if (should_load_whole_source(reader_opts)) { + data = cudf::detail::make_device_uvector_async(uncomp_data_, stream); + } + auto rec_starts = find_record_starts(reader_opts, data, stream); - CUDF_EXPECTS(!rec_starts.is_empty(), "Error enumerating records.\n"); + CUDF_EXPECTS(rec_starts.size() > 0, "Error enumerating records.\n"); - data = upload_data_to_device(reader_opts, rec_starts, stream); + if (not should_load_whole_source(reader_opts)) { + data = upload_data_to_device(reader_opts, rec_starts, stream); + } CUDF_EXPECTS(data.size() != 0, "Error uploading input data to the GPU.\n"); diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 79d70b2e121..807cc98e751 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -89,9 +89,9 @@ class reader_impl { * * Sets the uncomp_data_ and uncomp_size_ data members */ - rmm::device_uvector decompress_input(json_reader_options const& options, - std::vector const& buffer, - rmm::cuda_stream_view stream); + void decompress_input(json_reader_options const& options, + std::vector const& buffer, + rmm::cuda_stream_view stream); /** * @brief Finds all record starts in the file. 
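The commits in this stretch of the series all make the same move: state that reader_impl used to stash in data members (data_, uncomp_data_, the column map) becomes a value that one step returns and the next step receives explicitly. Below is a minimal, self-contained sketch of that pattern; the types and function names are placeholders for illustration only, not libcudf APIs.

// Illustrative sketch only -- placeholder names, not libcudf code.
#include <cstddef>
#include <vector>

namespace sketch {

// "Before": steps communicate through a data member, so call order and buffer
// lifetime are implicit in the class state.
struct stateful_reader {
  std::vector<char> uncompressed_;  // filled by decompress(), read by parse()

  void decompress(std::vector<char> const& raw) { uncompressed_ = raw; }
  std::size_t parse() const { return uncompressed_.size(); }
};

// "After": each step is a function of its inputs and returns its result, so
// read() shows the whole data flow and the member disappears.
inline std::vector<char> decompress(std::vector<char> const& raw) { return raw; }
inline std::size_t parse(std::vector<char> const& uncompressed) { return uncompressed.size(); }

inline std::size_t read(std::vector<char> const& raw)
{
  auto const uncompressed = decompress(raw);  // local value instead of a member
  return parse(uncompressed);                 // dependency visible at the call site
}

}  // namespace sketch

Once every intermediate result is passed around as a plain value like this, the later commits in the series can drop the wrapper class entirely and keep only free functions.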
From f0fd5c132cffca4991a367c4dcb9651446ce3b36 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 20 Aug 2021 22:37:48 -0500 Subject: [PATCH 22/32] remove json::reader_impl::decompress_input function and inline the logic --- cpp/src/io/json/reader_impl.cu | 51 +++++++++++---------------------- cpp/src/io/json/reader_impl.hpp | 21 +++----------- 2 files changed, 21 insertions(+), 51 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 94d9a1b6574..d1da78d018f 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -198,11 +198,11 @@ std::pair, col_map_ptr_type> reader_impl::get_json_obje create_col_names_hash_map(sorted_info->get_column(2).view(), stream)}; } -void reader_impl::ingest_raw_input(std::vector> const& sources, - std::vector& buffer, - size_t range_offset, - size_t range_size, - size_t range_size_padded) +std::vector reader_impl::ingest_raw_input( + std::vector> const& sources, + size_t range_offset, + size_t range_size, + size_t range_size_padded) { // Iterate through the user defined sources and read the contents into the local buffer size_t total_source_size = 0; @@ -211,7 +211,8 @@ void reader_impl::ingest_raw_input(std::vector> cons } total_source_size = total_source_size - (range_offset * sources.size()); - buffer.resize(total_source_size); + auto buffer = std::vector(total_source_size); + size_t bytes_read = 0; for (const auto& source : sources) { if (!source->is_empty()) { @@ -220,6 +221,8 @@ void reader_impl::ingest_raw_input(std::vector> cons bytes_read += source->host_read(range_offset, data_size, destination); } } + + return buffer; } bool should_load_whole_source(json_reader_options const& reader_opts) @@ -228,30 +231,6 @@ bool should_load_whole_source(json_reader_options const& reader_opts) reader_opts.get_byte_range_size() == 0; } -/** - * @brief Decompress the input data, if needed - * - * Sets the uncomp_data_ and uncomp_size_ data members - * Loads the data into device memory if byte range parameters are not used - */ -void reader_impl::decompress_input(json_reader_options const& reader_opts, - std::vector const& buffer, - rmm::cuda_stream_view stream) -{ - if (reader_opts.get_compression() == compression_type::NONE) { - // Do not use the owner vector here to avoid extra copy - uncomp_data_ = host_span(static_cast(buffer.data()), buffer.size()); - } else { - uncomp_data_owner_ = get_uncompressed_data( // - host_span( // - buffer.data(), - buffer.size()), - reader_opts.get_compression()); - - uncomp_data_ = host_span(uncomp_data_owner_.data(), uncomp_data_owner_.size()); - } -} - rmm::device_uvector reader_impl::find_record_starts( json_reader_options const& reader_opts, device_span data, @@ -632,13 +611,17 @@ table_with_metadata reader_impl::read(std::vector>& auto range_size = reader_opts.get_byte_range_size(); auto range_size_padded = reader_opts.get_byte_range_size_with_padding(); - std::vector buffer; - - ingest_raw_input(sources, buffer, range_offset, range_size, range_size_padded); + auto buffer = ingest_raw_input(sources, range_offset, range_size, range_size_padded); CUDF_EXPECTS(buffer.size() != 0, "Ingest failed: input data is null.\n"); - decompress_input(reader_opts, buffer, stream); + if (reader_opts.get_compression() != compression_type::NONE) { + buffer = get_uncompressed_data( // + host_span(buffer.data(), buffer.size()), + reader_opts.get_compression()); + } + + uncomp_data_ = host_span(static_cast(buffer.data()), buffer.size()); CUDF_EXPECTS(uncomp_data_.data() != 
nullptr, "Ingest failed: uncompressed input data is null.\n"); CUDF_EXPECTS(uncomp_data_.size() != 0, "Ingest failed: uncompressed input data has zero size.\n"); diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 807cc98e751..98fceb78931 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -54,9 +54,6 @@ class reader_impl { private: host_span uncomp_data_; - // Used when the input data is compressed, to ensure the allocated uncompressed data is freed - std::vector uncomp_data_owner_; - /** * @brief Ingest input JSON file/buffer, without decompression * @@ -67,11 +64,10 @@ class reader_impl { * @param[in] range_size Bytes to read; use `0` for all remaining data * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data */ - void ingest_raw_input(std::vector> const& sources, - std::vector& buffer, - size_t range_offset, - size_t range_size, - size_t range_size_padded); + std::vector ingest_raw_input(std::vector> const& sources, + size_t range_offset, + size_t range_size, + size_t range_size_padded); /** * @brief Extract the JSON objects keys from the input file with object rows. @@ -84,15 +80,6 @@ class reader_impl { device_span data, rmm::cuda_stream_view stream); - /** - * @brief Decompress the input data, if needed - * - * Sets the uncomp_data_ and uncomp_size_ data members - */ - void decompress_input(json_reader_options const& options, - std::vector const& buffer, - rmm::cuda_stream_view stream); - /** * @brief Finds all record starts in the file. * From 45714bb0b392849a81d1001ab16c309385c51901 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 20 Aug 2021 22:49:38 -0500 Subject: [PATCH 23/32] replace json::reader_impl::uncomp_data_ member with local variable --- cpp/src/io/json/reader_impl.cu | 74 ++++++++++++++++----------------- cpp/src/io/json/reader_impl.hpp | 13 +++--- 2 files changed, 43 insertions(+), 44 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index d1da78d018f..88432d72f3f 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -185,16 +185,17 @@ auto sort_keys_info_by_offset(std::unique_ptr
info) */ std::pair, col_map_ptr_type> reader_impl::get_json_object_keys_hashes( parse_options_view const& parse_opts, + host_span h_data, device_span rec_starts, - device_span data, + device_span d_data, rmm::cuda_stream_view stream) { - auto info = create_json_keys_info_table(parse_opts, data, rec_starts, stream); + auto info = create_json_keys_info_table(parse_opts, d_data, rec_starts, stream); auto aggregated_info = aggregate_keys_info(std::move(info)); auto sorted_info = sort_keys_info_by_offset(std::move(aggregated_info)); - return {create_key_strings(uncomp_data_.data(), sorted_info->view(), stream), + return {create_key_strings(h_data.data(), sorted_info->view(), stream), create_col_names_hash_map(sorted_info->get_column(2).view(), stream)}; } @@ -233,7 +234,8 @@ bool should_load_whole_source(json_reader_options const& reader_opts) rmm::device_uvector reader_impl::find_record_starts( json_reader_options const& reader_opts, - device_span data, + host_span h_data, + device_span d_data, rmm::cuda_stream_view stream) { std::vector chars_to_count{'\n'}; @@ -243,9 +245,9 @@ rmm::device_uvector reader_impl::find_record_starts( // If not starting at an offset, add an extra row to account for the first row in the file cudf::size_type prefilter_count = ((reader_opts.get_byte_range_offset() == 0) ? 1 : 0); if (should_load_whole_source(reader_opts)) { - prefilter_count += count_all_from_set(data, chars_to_count, stream); + prefilter_count += count_all_from_set(d_data, chars_to_count, stream); } else { - prefilter_count += count_all_from_set(uncomp_data_, chars_to_count, stream); + prefilter_count += count_all_from_set(h_data, chars_to_count, stream); } rmm::device_uvector rec_starts(prefilter_count, stream); @@ -260,14 +262,9 @@ rmm::device_uvector reader_impl::find_record_starts( std::vector chars_to_find{'\n'}; // Passing offset = 1 to return positions AFTER the found character if (should_load_whole_source(reader_opts)) { - find_all_from_set(data, chars_to_find, 1, find_result_ptr, stream); + find_all_from_set(d_data, chars_to_find, 1, find_result_ptr, stream); } else { - find_all_from_set( // - uncomp_data_, - chars_to_find, - 1, - find_result_ptr, - stream); + find_all_from_set(h_data, chars_to_find, 1, find_result_ptr, stream); } // Previous call stores the record pinput_file.typeositions as encountered by all threads @@ -278,7 +275,7 @@ rmm::device_uvector reader_impl::find_record_starts( auto filtered_count = prefilter_count; // Exclude the ending newline as it does not precede a record start - if (uncomp_data_.back() == '\n') { filtered_count--; } + if (h_data.back() == '\n') { filtered_count--; } rec_starts.resize(filtered_count, stream); return rec_starts; @@ -293,10 +290,11 @@ rmm::device_uvector reader_impl::find_record_starts( */ rmm::device_uvector reader_impl::upload_data_to_device( json_reader_options const& reader_opts, + host_span h_data, rmm::device_uvector& rec_starts, rmm::cuda_stream_view stream) { - size_t end_offset = uncomp_data_.size(); + size_t end_offset = h_data.size(); // Trim lines that are outside range auto h_rec_starts = cudf::detail::make_std_vector_sync(rec_starts, stream); @@ -322,22 +320,23 @@ rmm::device_uvector reader_impl::upload_data_to_device( thrust::minus()); const size_t bytes_to_upload = end_offset - start_offset; - CUDF_EXPECTS(bytes_to_upload <= uncomp_data_.size(), + CUDF_EXPECTS(bytes_to_upload <= h_data.size(), "Error finding the record within the specified byte range.\n"); // Upload the raw data that is within the rows of interest - return 
cudf::detail::make_device_uvector_async( - uncomp_data_.subspan(start_offset, bytes_to_upload), stream); + return cudf::detail::make_device_uvector_async(h_data.subspan(start_offset, bytes_to_upload), + stream); } std::pair, col_map_ptr_type> reader_impl::get_column_names_and_map( parse_options_view const& parse_opts, + host_span h_data, device_span rec_starts, - device_span data, + device_span d_data, rmm::cuda_stream_view stream) { // If file only contains one row, use the file size for the row size - uint64_t first_row_len = data.size() / sizeof(char); + uint64_t first_row_len = d_data.size() / sizeof(char); if (rec_starts.size() > 1) { // Set first_row_len to the offset of the second row, if it exists CUDA_TRY(cudaMemcpyAsync(&first_row_len, @@ -348,7 +347,7 @@ std::pair, col_map_ptr_type> reader_impl::get_column_na } std::vector first_row(first_row_len); CUDA_TRY(cudaMemcpyAsync(first_row.data(), - data.data(), + d_data.data(), first_row_len * sizeof(char), cudaMemcpyDeviceToHost, stream.value())); @@ -365,7 +364,7 @@ std::pair, col_map_ptr_type> reader_impl::get_column_na // If the first opening bracket is '{', assume object format if (first_curly_bracket < first_square_bracket) { // use keys as column names if input rows are objects - return get_json_object_keys_hashes(parse_opts, rec_starts, data, stream); + return get_json_object_keys_hashes(parse_opts, h_data, rec_starts, d_data, stream); } else { int cols_found = 0; bool quotation = false; @@ -611,38 +610,37 @@ table_with_metadata reader_impl::read(std::vector>& auto range_size = reader_opts.get_byte_range_size(); auto range_size_padded = reader_opts.get_byte_range_size_with_padding(); - auto buffer = ingest_raw_input(sources, range_offset, range_size, range_size_padded); + auto h_data = ingest_raw_input(sources, range_offset, range_size, range_size_padded); - CUDF_EXPECTS(buffer.size() != 0, "Ingest failed: input data is null.\n"); + CUDF_EXPECTS(h_data.size() != 0, "Ingest failed: input data is null.\n"); if (reader_opts.get_compression() != compression_type::NONE) { - buffer = get_uncompressed_data( // - host_span(buffer.data(), buffer.size()), + h_data = get_uncompressed_data( // + host_span(h_data.data(), h_data.size()), reader_opts.get_compression()); } - uncomp_data_ = host_span(static_cast(buffer.data()), buffer.size()); + CUDF_EXPECTS(h_data.data() != nullptr, "Ingest failed: uncompressed input data is null.\n"); + CUDF_EXPECTS(h_data.size() != 0, "Ingest failed: uncompressed input data has zero size.\n"); - CUDF_EXPECTS(uncomp_data_.data() != nullptr, "Ingest failed: uncompressed input data is null.\n"); - CUDF_EXPECTS(uncomp_data_.size() != 0, "Ingest failed: uncompressed input data has zero size.\n"); - - auto data = rmm::device_uvector(0, stream); + auto d_data = rmm::device_uvector(0, stream); if (should_load_whole_source(reader_opts)) { - data = cudf::detail::make_device_uvector_async(uncomp_data_, stream); + d_data = cudf::detail::make_device_uvector_async(h_data, stream); } - auto rec_starts = find_record_starts(reader_opts, data, stream); + auto rec_starts = find_record_starts(reader_opts, h_data, d_data, stream); CUDF_EXPECTS(rec_starts.size() > 0, "Error enumerating records.\n"); if (not should_load_whole_source(reader_opts)) { - data = upload_data_to_device(reader_opts, rec_starts, stream); + d_data = upload_data_to_device(reader_opts, h_data, rec_starts, stream); } - CUDF_EXPECTS(data.size() != 0, "Error uploading input data to the GPU.\n"); + CUDF_EXPECTS(d_data.size() != 0, "Error uploading input data to 
the GPU.\n"); - auto column_names_and_map = get_column_names_and_map(parse_opts.view(), rec_starts, data, stream); + auto column_names_and_map = + get_column_names_and_map(parse_opts.view(), h_data, rec_starts, d_data, stream); auto column_names = std::get<0>(column_names_and_map); auto column_map = std::move(std::get<1>(column_names_and_map)); @@ -650,12 +648,12 @@ table_with_metadata reader_impl::read(std::vector>& CUDF_EXPECTS(not column_names.empty(), "Error determining column names.\n"); auto dtypes = get_data_types( - reader_opts, parse_opts.view(), column_names, column_map.get(), rec_starts, data, stream); + reader_opts, parse_opts.view(), column_names, column_map.get(), rec_starts, d_data, stream); CUDF_EXPECTS(not dtypes.empty(), "Error in data type detection.\n"); return convert_data_to_table( - parse_opts.view(), dtypes, column_names, column_map.get(), rec_starts, data, stream, mr); + parse_opts.view(), dtypes, column_names, column_map.get(), rec_starts, d_data, stream, mr); } table_with_metadata read_json(std::vector>& sources, diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 98fceb78931..aace05403e5 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -50,10 +50,7 @@ using col_map_ptr_type = std::unique_ptr uncomp_data_; - /** * @brief Ingest input JSON file/buffer, without decompression * @@ -76,8 +73,9 @@ class reader_impl { */ std::pair, col_map_ptr_type> get_json_object_keys_hashes( parse_options_view const& parse_opts, + host_span h_data, device_span rec_starts, - device_span data, + device_span d_data, rmm::cuda_stream_view stream); /** @@ -89,7 +87,8 @@ class reader_impl { * @return Record starts in the device memory */ rmm::device_uvector find_record_starts(json_reader_options const& reader_opts, - device_span data, + host_span h_data, + device_span d_data, rmm::cuda_stream_view stream); /** @@ -100,6 +99,7 @@ class reader_impl { * Also updates the array of record starts to match the device data offset. 
*/ rmm::device_uvector upload_data_to_device(json_reader_options const& reader_opts, + host_span h_data, rmm::device_uvector& rec_starts, rmm::cuda_stream_view stream); @@ -113,8 +113,9 @@ class reader_impl { */ std::pair, col_map_ptr_type> get_column_names_and_map( parse_options_view const& parse_opts, + host_span h_data, device_span rec_starts, - device_span data, + device_span d_data, rmm::cuda_stream_view stream); std::vector parse_data_types(std::vector const& column_names, From 8d18ffe4e8bdc7965f6c3bf92efead93ff5b98c9 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 20 Aug 2021 23:03:04 -0500 Subject: [PATCH 24/32] relocate json::reader_impl decompression code --- cpp/src/io/json/reader_impl.cu | 19 ++++++++----------- cpp/src/io/json/reader_impl.hpp | 1 + 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 88432d72f3f..0f2034d15ab 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -201,6 +201,7 @@ std::pair, col_map_ptr_type> reader_impl::get_json_obje std::vector reader_impl::ingest_raw_input( std::vector> const& sources, + compression_type compression, size_t range_offset, size_t range_size, size_t range_size_padded) @@ -223,7 +224,11 @@ std::vector reader_impl::ingest_raw_input( } } - return buffer; + if (compression == compression_type::NONE) { + return buffer; + } else { + return get_uncompressed_data(buffer, compression); + } } bool should_load_whole_source(json_reader_options const& reader_opts) @@ -610,17 +615,9 @@ table_with_metadata reader_impl::read(std::vector>& auto range_size = reader_opts.get_byte_range_size(); auto range_size_padded = reader_opts.get_byte_range_size_with_padding(); - auto h_data = ingest_raw_input(sources, range_offset, range_size, range_size_padded); - - CUDF_EXPECTS(h_data.size() != 0, "Ingest failed: input data is null.\n"); - - if (reader_opts.get_compression() != compression_type::NONE) { - h_data = get_uncompressed_data( // - host_span(h_data.data(), h_data.size()), - reader_opts.get_compression()); - } + auto h_data = ingest_raw_input( + sources, reader_opts.get_compression(), range_offset, range_size, range_size_padded); - CUDF_EXPECTS(h_data.data() != nullptr, "Ingest failed: uncompressed input data is null.\n"); CUDF_EXPECTS(h_data.size() != 0, "Ingest failed: uncompressed input data has zero size.\n"); auto d_data = rmm::device_uvector(0, stream); diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index aace05403e5..22c016c6613 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -62,6 +62,7 @@ class reader_impl { * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data */ std::vector ingest_raw_input(std::vector> const& sources, + compression_type compression, size_t range_offset, size_t range_size, size_t range_size_padded); From 06a39b7f2fdb733d2e83443c94dd40c3d932f610 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 20 Aug 2021 23:13:11 -0500 Subject: [PATCH 25/32] remove unneccessary json::reader_impl class --- cpp/src/io/json/reader_impl.cu | 105 ++++++++++--------- cpp/src/io/json/reader_impl.hpp | 180 -------------------------------- 2 files changed, 53 insertions(+), 232 deletions(-) delete mode 100644 cpp/src/io/json/reader_impl.hpp diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 0f2034d15ab..745bfd40888 100644 --- a/cpp/src/io/json/reader_impl.cu +++ 
b/cpp/src/io/json/reader_impl.cu @@ -19,25 +19,33 @@ * @brief cuDF-IO JSON reader class implementation */ -#include "reader_impl.hpp" +#include "json_common.h" +#include "json_gpu.h" + +#include #include +#include #include +#include #include #include #include #include #include +#include +#include +#include #include #include #include #include #include -#include #include #include +#include #include #include @@ -48,7 +56,12 @@ namespace cudf { namespace io { namespace detail { namespace json { + using namespace cudf::io; +using namespace cudf::io::json; + +using col_map_type = cudf::io::json::gpu::col_map_type; +using col_map_ptr_type = std::unique_ptr>; /** * @brief Aggregate the table containing keys info by their hash values. @@ -183,7 +196,7 @@ auto sort_keys_info_by_offset(std::unique_ptr
info) * * @return Names of JSON object keys in the file */ -std::pair, col_map_ptr_type> reader_impl::get_json_object_keys_hashes( +std::pair, col_map_ptr_type> get_json_object_keys_hashes( parse_options_view const& parse_opts, host_span h_data, device_span rec_starts, @@ -199,12 +212,11 @@ std::pair, col_map_ptr_type> reader_impl::get_json_obje create_col_names_hash_map(sorted_info->get_column(2).view(), stream)}; } -std::vector reader_impl::ingest_raw_input( - std::vector> const& sources, - compression_type compression, - size_t range_offset, - size_t range_size, - size_t range_size_padded) +std::vector ingest_raw_input(std::vector> const& sources, + compression_type compression, + size_t range_offset, + size_t range_size, + size_t range_size_padded) { // Iterate through the user defined sources and read the contents into the local buffer size_t total_source_size = 0; @@ -237,11 +249,10 @@ bool should_load_whole_source(json_reader_options const& reader_opts) reader_opts.get_byte_range_size() == 0; } -rmm::device_uvector reader_impl::find_record_starts( - json_reader_options const& reader_opts, - host_span h_data, - device_span d_data, - rmm::cuda_stream_view stream) +rmm::device_uvector find_record_starts(json_reader_options const& reader_opts, + host_span h_data, + device_span d_data, + rmm::cuda_stream_view stream) { std::vector chars_to_count{'\n'}; // Currently, ignoring lineterminations within quotes is handled by recording the records of both, @@ -293,11 +304,10 @@ rmm::device_uvector reader_impl::find_record_starts( * Only rows that need to be parsed are copied, based on the byte range * Also updates the array of record starts to match the device data offset. */ -rmm::device_uvector reader_impl::upload_data_to_device( - json_reader_options const& reader_opts, - host_span h_data, - rmm::device_uvector& rec_starts, - rmm::cuda_stream_view stream) +rmm::device_uvector upload_data_to_device(json_reader_options const& reader_opts, + host_span h_data, + rmm::device_uvector& rec_starts, + rmm::cuda_stream_view stream) { size_t end_offset = h_data.size(); @@ -333,7 +343,7 @@ rmm::device_uvector reader_impl::upload_data_to_device( stream); } -std::pair, col_map_ptr_type> reader_impl::get_column_names_and_map( +std::pair, col_map_ptr_type> get_column_names_and_map( parse_options_view const& parse_opts, host_span h_data, device_span rec_starts, @@ -389,8 +399,8 @@ std::pair, col_map_ptr_type> reader_impl::get_column_na } } -std::vector reader_impl::parse_data_types( - std::vector const& column_names, std::vector const& types_as_strings) +std::vector parse_data_types(std::vector const& column_names, + std::vector const& types_as_strings) { CUDF_EXPECTS(types_as_strings.size() == column_names.size(), "Need to specify the type of each column.\n"); @@ -431,13 +441,13 @@ std::vector reader_impl::parse_data_types( return dtypes; } -std::vector reader_impl::get_data_types(json_reader_options const& reader_opts, - parse_options_view const& parse_opts, - std::vector const& column_names, - col_map_type* column_map, - device_span rec_starts, - device_span data, - rmm::cuda_stream_view stream) +std::vector get_data_types(json_reader_options const& reader_opts, + parse_options_view const& parse_opts, + std::vector const& column_names, + col_map_type* column_map, + device_span rec_starts, + device_span data, + rmm::cuda_stream_view stream) { bool has_to_infer_column_types = std::visit([](const auto& dtypes) { return dtypes.empty(); }, reader_opts.get_dtypes()); @@ -506,14 +516,14 @@ std::vector 
reader_impl::get_data_types(json_reader_options const& re } } -table_with_metadata reader_impl::convert_data_to_table(parse_options_view const& parse_opts, - std::vector const& dtypes, - std::vector const& column_names, - col_map_type* column_map, - device_span rec_starts, - device_span data, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, + std::vector const& dtypes, + std::vector const& column_names, + col_map_type* column_map, + device_span rec_starts, + device_span data, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { const auto num_columns = dtypes.size(); const auto num_records = rec_starts.size(); @@ -596,11 +606,13 @@ table_with_metadata reader_impl::convert_data_to_table(parse_options_view const& * * @return Table and its metadata */ -table_with_metadata reader_impl::read(std::vector>& sources, - json_reader_options const& reader_opts, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +table_with_metadata read_json(std::vector>& sources, + json_reader_options const& reader_opts, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { + CUDF_EXPECTS(not sources.empty(), "No sources were defined"); + CUDF_EXPECTS(reader_opts.is_enabled_lines(), "Only JSON Lines format is currently supported.\n"); auto parse_opts = parse_options{',', '\n', '\"', '.'}; @@ -653,17 +665,6 @@ table_with_metadata reader_impl::read(std::vector>& parse_opts.view(), dtypes, column_names, column_map.get(), rec_starts, d_data, stream, mr); } -table_with_metadata read_json(std::vector>& sources, - json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(not sources.empty(), "No sources were defined"); - - auto impl = std::make_unique(); - - return table_with_metadata{impl->read(sources, options, stream, mr)}; -} } // namespace json } // namespace detail } // namespace io diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp deleted file mode 100644 index 22c016c6613..00000000000 --- a/cpp/src/io/json/reader_impl.hpp +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file reader_impl.hpp - * @brief cuDF-IO JSON reader class implementation header - */ - -#pragma once - -#include "json_common.h" -#include "json_gpu.h" - -#include - -#include - -#include -#include -#include - -#include -#include -#include - -namespace cudf { -namespace io { -namespace detail { -namespace json { -using namespace cudf::io::json; -using namespace cudf::io; - -using col_map_type = cudf::io::json::gpu::col_map_type; -using col_map_ptr_type = std::unique_ptr>; - -/** - * @brief Class used to parse Json input and convert it into gdf columns. 
- */ -class reader_impl { - private: - /** - * @brief Ingest input JSON file/buffer, without decompression - * - * Sets the source_, byte_range_offset_, and byte_range_size_ data members - * - * @param[in] buffer Buffer to read the bytes in to - * @param[in] range_offset Number of bytes offset from the start - * @param[in] range_size Bytes to read; use `0` for all remaining data - * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data - */ - std::vector ingest_raw_input(std::vector> const& sources, - compression_type compression, - size_t range_offset, - size_t range_size, - size_t range_size_padded); - - /** - * @brief Extract the JSON objects keys from the input file with object rows. - * - * @return Array of keys and a map that maps their hash values to column indices - */ - std::pair, col_map_ptr_type> get_json_object_keys_hashes( - parse_options_view const& parse_opts, - host_span h_data, - device_span rec_starts, - device_span d_data, - rmm::cuda_stream_view stream); - - /** - * @brief Finds all record starts in the file. - * - * Does not upload the entire file to the GPU - * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - * @return Record starts in the device memory - */ - rmm::device_uvector find_record_starts(json_reader_options const& reader_opts, - host_span h_data, - device_span d_data, - rmm::cuda_stream_view stream); - - /** - * @brief Uploads the relevant segment of the input json data onto the GPU. - * - * Sets the d_data_ data member. - * Only rows that need to be parsed are copied, based on the byte range - * Also updates the array of record starts to match the device data offset. - */ - rmm::device_uvector upload_data_to_device(json_reader_options const& reader_opts, - host_span h_data, - rmm::device_uvector& rec_starts, - rmm::cuda_stream_view stream); - - /** - * @brief Parse the first row to set the column name - * - * Sets the column_names_ data member - * - * @param[in] rec_starts Record starts in device memory - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - */ - std::pair, col_map_ptr_type> get_column_names_and_map( - parse_options_view const& parse_opts, - host_span h_data, - device_span rec_starts, - device_span d_data, - rmm::cuda_stream_view stream); - - std::vector parse_data_types(std::vector const& column_names, - std::vector const& types_as_strings); - - /** - * @brief Set the data type array data member - * - * If user does not pass the data types, deduces types from the file content - * - * @param[in] reader_opts Settings for controlling reading behavior - * @param[in] rec_starts Record starts in device memory - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - */ - std::vector get_data_types(json_reader_options const& reader_opts, - parse_options_view const& parse_opts, - std::vector const& column_names, - col_map_type* column_map, - device_span rec_starts, - device_span data, - rmm::cuda_stream_view stream); - - /** - * @brief Parse the input data and store results a table - * - * @param[in] rec_starts Record starts in device memory - * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
- * @param[in] mr Device memory resource to use for device memory allocation - * - * @return Table and its metadata - */ - table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, - std::vector const& dtypes, - std::vector const& column_names, - col_map_type* column_map, - device_span rec_starts, - device_span data, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - - public: - /** - * @brief Read an entire set or a subset of data from the source - * - * @param[in] sources Input `datasource` objects to read the dataset from - * @param[in] options Settings for controlling reading behavior - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - * @param[in] mr Device memory resource to use for device memory allocation - * - * @return Table and its metadata - */ - table_with_metadata read(std::vector>& sources, - json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); -}; - -} // namespace json -} // namespace detail -} // namespace io -} // namespace cudf From c658af184707d976385e02d6362bbdca5a315be2 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 20 Aug 2021 23:14:41 -0500 Subject: [PATCH 26/32] remove deprected file header format --- cpp/src/io/json/reader_impl.cu | 5 ----- 1 file changed, 5 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 745bfd40888..28954b413fa 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -14,11 +14,6 @@ * limitations under the License. */ -/** - * @file reader_impl.cu - * @brief cuDF-IO JSON reader class implementation - */ - #include "json_common.h" #include "json_gpu.h" From a1945afb0bfb247e4bed39ee535893badafb748c Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 20 Aug 2021 23:24:49 -0500 Subject: [PATCH 27/32] remove json_common.h --- cpp/src/io/json/json_common.h | 23 ----------------------- cpp/src/io/json/json_gpu.cu | 5 ++++- cpp/src/io/json/json_gpu.h | 2 +- cpp/src/io/json/reader_impl.cu | 2 +- 4 files changed, 6 insertions(+), 26 deletions(-) delete mode 100644 cpp/src/io/json/json_common.h diff --git a/cpp/src/io/json/json_common.h b/cpp/src/io/json/json_common.h deleted file mode 100644 index 803b937e58d..00000000000 --- a/cpp/src/io/json/json_common.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -using cudf::io::detail::string_index_pair; diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index d3930daefd2..ac47ef552dc 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -14,10 +14,10 @@ * limitations under the License. 
*/ -#include "json_common.h" #include "json_gpu.h" #include +#include #include #include @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -511,6 +512,8 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts, current = desc.value_end + 1; + using string_index_pair = thrust::pair; + // Empty fields are not legal values if (!serialized_trie_contains(opts.trie_na, {desc.value_begin, value_len})) { // Type dispatcher does not handle strings diff --git a/cpp/src/io/json/json_gpu.h b/cpp/src/io/json/json_gpu.h index 7a6bce5e5a5..92024c3e8e6 100644 --- a/cpp/src/io/json/json_gpu.h +++ b/cpp/src/io/json/json_gpu.h @@ -16,8 +16,8 @@ #pragma once +#include #include -#include "json_common.h" #include diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 28954b413fa..a9d55a6f743 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -14,7 +14,6 @@ * limitations under the License. */ -#include "json_common.h" #include "json_gpu.h" #include @@ -35,6 +34,7 @@ #include #include #include +#include #include #include From cf5867fe9c40346df658137fe9542a5a6302f071 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 25 Aug 2021 11:30:49 -0500 Subject: [PATCH 28/32] re-delete json reader_impl.hpp --- cpp/src/io/json/reader_impl.hpp | 205 -------------------------------- 1 file changed, 205 deletions(-) delete mode 100644 cpp/src/io/json/reader_impl.hpp diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp deleted file mode 100644 index 4d14edf360a..00000000000 --- a/cpp/src/io/json/reader_impl.hpp +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file reader_impl.hpp - * @brief cuDF-IO JSON reader class implementation header - */ - -#pragma once - -#include "json_common.h" -#include "json_gpu.h" - -#include - -#include - -#include -#include -#include - -#include -#include -#include - -namespace cudf { -namespace io { -namespace detail { -namespace json { -using namespace cudf::io::json; -using namespace cudf::io; - -using col_map_type = cudf::io::json::gpu::col_map_type; -using col_map_ptr_type = std::unique_ptr>; - -/** - * @brief Class used to parse Json input and convert it into gdf columns. 
- */ -class reader::impl { - public: - private: - const json_reader_options options_{}; - - rmm::mr::device_memory_resource* mr_ = nullptr; - - std::vector> sources_; - std::vector buffer_; - - const char* uncomp_data_ = nullptr; - size_t uncomp_size_ = 0; - - // Used when the input data is compressed, to ensure the allocated uncompressed data is freed - std::vector uncomp_data_owner_; - rmm::device_buffer data_; - - size_t byte_range_offset_ = 0; - size_t byte_range_size_ = 0; - bool load_whole_source_ = true; - - table_metadata metadata_; - std::vector dtypes_; - - // the map is only used for files with rows in object format; initialize to a dummy value so the - // map object can be passed to the kernel in any case - col_map_ptr_type key_to_col_idx_map_; - std::unique_ptr> d_key_col_map_; - - // parsing options - const bool allow_newlines_in_strings_ = false; - parse_options opts_{',', '\n', '\"', '.'}; - - /** - * @brief Sets the column map data member and makes a device copy to be used as a kernel - * parameter. - */ - void set_column_map(col_map_ptr_type&& map, rmm::cuda_stream_view stream) - { - key_to_col_idx_map_ = std::move(map); - d_key_col_map_ = - std::make_unique>(*key_to_col_idx_map_, stream); - } - /** - * @brief Gets the pointer to the column hash map in the device memory. - * - * Returns `nullptr` if the map is not created. - */ - auto get_column_map_device_ptr() - { - return key_to_col_idx_map_ ? d_key_col_map_->data() : nullptr; - } - - /** - * @brief Ingest input JSON file/buffer, without decompression - * - * Sets the source_, byte_range_offset_, and byte_range_size_ data members - * - * @param[in] range_offset Number of bytes offset from the start - * @param[in] range_size Bytes to read; use `0` for all remaining data - * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data - */ - void ingest_raw_input(size_t range_offset, size_t range_size, size_t range_size_padded); - - /** - * @brief Extract the JSON objects keys from the input file with object rows. - * - * @return Array of keys and a map that maps their hash values to column indices - */ - std::pair, col_map_ptr_type> get_json_object_keys_hashes( - device_span rec_starts, rmm::cuda_stream_view stream); - - /** - * @brief Decompress the input data, if needed - * - * Sets the uncomp_data_ and uncomp_size_ data members - */ - void decompress_input(rmm::cuda_stream_view stream); - - /** - * @brief Finds all record starts in the file. - * - * Does not upload the entire file to the GPU - * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - * @return Record starts in the device memory - */ - rmm::device_uvector find_record_starts(rmm::cuda_stream_view stream); - - /** - * @brief Uploads the relevant segment of the input json data onto the GPU. - * - * Sets the d_data_ data member. - * Only rows that need to be parsed are copied, based on the byte range - * Also updates the array of record starts to match the device data offset. - */ - void upload_data_to_device(rmm::device_uvector& rec_starts, - rmm::cuda_stream_view stream); - - /** - * @brief Parse the first row to set the column name - * - * Sets the column_names_ data member - * - * @param[in] rec_starts Record starts in device memory - * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
- */ - void set_column_names(device_span rec_starts, rmm::cuda_stream_view stream); - - /** - * @brief Set the data type array data member - * - * If user does not pass the data types, deduces types from the file content - * - * @param[in] rec_starts Record starts in device memory - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - */ - void set_data_types(device_span rec_starts, rmm::cuda_stream_view stream); - - /** - * @brief Parse the input data and store results a table - * - * @param[in] rec_starts Record starts in device memory - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - * - * @return Table and its metadata - */ - table_with_metadata convert_data_to_table(device_span rec_starts, - rmm::cuda_stream_view stream); - - public: - /** - * @brief Constructor from a dataset source with reader options. - */ - explicit impl(std::vector>&& sources, - json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - - /** - * @brief Read an entire set or a subset of data from the source - * - * @param[in] options Settings for controlling reading behavior - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - * - * @return Table and its metadata - */ - table_with_metadata read(json_reader_options const& options, rmm::cuda_stream_view stream); -}; - -} // namespace json -} // namespace detail -} // namespace io -} // namespace cudf From bd69fbd40d2e490ade7f28376036210c11bf5343 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 25 Aug 2021 11:34:06 -0500 Subject: [PATCH 29/32] fix bad merge where changes in 9079 were deleted. --- cpp/src/io/json/reader_impl.cu | 58 +++++----------------------------- 1 file changed, 8 insertions(+), 50 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index dd3c14ad9ad..c23f1482234 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -26,7 +26,6 @@ #include #include -#include #include #include #include @@ -394,48 +393,6 @@ std::pair, col_map_ptr_type> get_column_names_and_map( } } -std::vector parse_data_types(std::vector const& column_names, - std::vector const& types_as_strings) -{ - CUDF_EXPECTS(types_as_strings.size() == column_names.size(), - "Need to specify the type of each column.\n"); - std::vector dtypes; - // Assume that the dtype is in dictionary format only if all elements contain a colon - const bool is_dict = std::all_of( - std::cbegin(types_as_strings), std::cend(types_as_strings), [](const std::string& s) { - return std::find(std::cbegin(s), std::cend(s), ':') != std::cend(s); - }); - - auto split_on_colon = [](std::string_view s) { - auto const i = s.find(":"); - return std::pair{s.substr(0, i), s.substr(i + 1)}; - }; - - if (is_dict) { - std::map col_type_map; - std::transform( - std::cbegin(types_as_strings), - std::cend(types_as_strings), - std::inserter(col_type_map, col_type_map.end()), - [&](auto const& ts) { - auto const [col_name, type_str] = split_on_colon(ts); - return std::pair{std::string{col_name}, convert_string_to_dtype(std::string{type_str})}; - }); - - // Using the map here allows O(n log n) complexity - std::transform(std::cbegin(column_names), - std::cend(column_names), - std::back_inserter(dtypes), - [&](auto const& column_name) { return col_type_map[column_name]; }); - } else { - std::transform(std::cbegin(types_as_strings), - std::cend(types_as_strings), - std::back_inserter(dtypes), 
- [](auto const& col_dtype) { return convert_string_to_dtype(col_dtype); }); - } - return dtypes; -} - std::vector get_data_types(json_reader_options const& reader_opts, parse_options_view const& parse_opts, std::vector const& column_names, @@ -449,11 +406,15 @@ std::vector get_data_types(json_reader_options const& reader_opts, if (!has_to_infer_column_types) { return std::visit(cudf::detail::visitor_overload{ - [&](const std::vector& dtypes) { return dtypes; }, + [&](const std::vector& dtypes) { + CUDF_EXPECTS(dtypes.size() == metadata_.column_names.size(), + "Must specify types for all columns"); + return dtypes; + }, [&](const std::map& dtypes) { std::vector sorted_dtypes; - std::transform(std::cbegin(column_names), - std::cend(column_names), + std::transform(std::cbegin(metadata_.column_names), + std::cend(metadata_.column_names), std::back_inserter(sorted_dtypes), [&](auto const& column_name) { auto const it = dtypes.find(column_name); @@ -462,11 +423,8 @@ std::vector get_data_types(json_reader_options const& reader_opts, return it->second; }); return sorted_dtypes; - }, - [&](std::vector const& dtypes) { - return parse_data_types(column_names, dtypes); }}, - reader_opts.get_dtypes()); + options_.get_dtypes()); } else { CUDF_EXPECTS(rec_starts.size() != 0, "No data available for data type inference.\n"); auto const num_columns = column_names.size(); From 33d3cb7f7b183392cd43516d1e703ee76a78b8e3 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 25 Aug 2021 22:00:52 -0500 Subject: [PATCH 30/32] read_json: fix missing visitor_overload include --- cpp/src/io/json/reader_impl.cu | 40 +++++++++++++++++----------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index c23f1482234..3f11c4ed7f2 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -26,6 +26,7 @@ #include #include +#include #include #include #include @@ -405,26 +406,25 @@ std::vector get_data_types(json_reader_options const& reader_opts, std::visit([](const auto& dtypes) { return dtypes.empty(); }, reader_opts.get_dtypes()); if (!has_to_infer_column_types) { - return std::visit(cudf::detail::visitor_overload{ - [&](const std::vector& dtypes) { - CUDF_EXPECTS(dtypes.size() == metadata_.column_names.size(), - "Must specify types for all columns"); - return dtypes; - }, - [&](const std::map& dtypes) { - std::vector sorted_dtypes; - std::transform(std::cbegin(metadata_.column_names), - std::cend(metadata_.column_names), - std::back_inserter(sorted_dtypes), - [&](auto const& column_name) { - auto const it = dtypes.find(column_name); - CUDF_EXPECTS(it != dtypes.end(), - "Must specify types for all columns"); - return it->second; - }); - return sorted_dtypes; - }}, - options_.get_dtypes()); + return std::visit( + cudf::detail::visitor_overload{ + [&](const std::vector& dtypes) { + CUDF_EXPECTS(dtypes.size() == column_names.size(), "Must specify types for all columns"); + return dtypes; + }, + [&](const std::map& dtypes) { + std::vector sorted_dtypes; + std::transform(std::cbegin(column_names), + std::cend(column_names), + std::back_inserter(sorted_dtypes), + [&](auto const& column_name) { + auto const it = dtypes.find(column_name); + CUDF_EXPECTS(it != dtypes.end(), "Must specify types for all columns"); + return it->second; + }); + return sorted_dtypes; + }}, + reader_opts.get_dtypes()); } else { CUDF_EXPECTS(rec_starts.size() != 0, "No data available for data type inference.\n"); auto const num_columns = 
column_names.size(); From 5e0ff9ae38c27d9db567ef46659a0b590481574e Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 11 Nov 2021 10:09:29 -0600 Subject: [PATCH 31/32] remove unnecessary doc comments and div-by-1 --- cpp/include/cudf/io/detail/json.hpp | 15 +++++---------- cpp/src/io/json/reader_impl.cu | 2 +- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 7ab8906e5a9..ca490b2619e 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -12,12 +12,7 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - */ - -/** - * @file json.hpp - * @brief cuDF-IO reader classes API - */ + */\ #pragma once @@ -33,10 +28,10 @@ namespace json { /** * @brief Reads and returns the entire data set. * - * @param[in] sources Input `datasource` objects to read the dataset from - * @param[in] options Settings for controlling reading behavior - * @param[in] stream CUDA stream used for device memory operations and kernel launches - * @param[in] mr Device memory resource to use for device memory allocation + * @param sources Input `datasource` objects to read the dataset from + * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation * * @return cudf::table object that contains the array of cudf::column. */ diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 0d819930ac9..319906111af 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -345,7 +345,7 @@ std::pair, col_map_ptr_type> get_column_names_and_map( rmm::cuda_stream_view stream) { // If file only contains one row, use the file size for the row size - uint64_t first_row_len = d_data.size() / sizeof(char); + uint64_t first_row_len = d_data.size(); if (rec_starts.size() > 1) { // Set first_row_len to the offset of the second row, if it exists CUDA_TRY(cudaMemcpyAsync(&first_row_len, From cf428385e9393c0c6b21a75238a02e7657f97ed6 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 11 Nov 2021 10:28:41 -0600 Subject: [PATCH 32/32] fix formatting issue --- cpp/include/cudf/io/detail/json.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index ca490b2619e..69b26a7b70a 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -12,7 +12,7 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - */\ + */ #pragma once
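
The dtype-dispatch code touched in PATCH 29 and PATCH 30 selects column types with std::visit over the variant returned by json_reader_options::get_dtypes(), using cudf::detail::visitor_overload to bundle one lambda per variant alternative (hence the missing-include fix in PATCH 30). A minimal, self-contained C++17 sketch of that overloaded-lambda visitor pattern follows; the local `overload` helper, the string-based dtype stand-ins, and the variant type are illustrative assumptions only, not cudf APIs, and the snippet is not part of any patch in this series.

#include <iostream>
#include <map>
#include <string>
#include <variant>
#include <vector>

// Local stand-in for cudf::detail::visitor_overload: inherits each lambda's
// operator(), so std::visit can dispatch on the active variant alternative.
template <typename... Fs>
struct overload : Fs... {
  using Fs::operator()...;
};
template <typename... Fs>
overload(Fs...) -> overload<Fs...>;  // C++17 deduction guide

int main()
{
  // Simplified shape of json_reader_options::get_dtypes(): either a flat list
  // of types or a per-column map (types shown as strings instead of data_type).
  using dtypes_arg = std::variant<std::vector<std::string>, std::map<std::string, std::string>>;

  std::vector<std::string> const column_names{"a", "b"};
  dtypes_arg const arg = std::map<std::string, std::string>{{"a", "int64"}, {"b", "float64"}};

  auto const sorted_dtypes = std::visit(
    overload{
      // Flat list: assumed to already be in column order.
      [&](std::vector<std::string> const& dtypes) { return dtypes; },
      // Map: reorder the types to match column_names, as the patched code does.
      [&](std::map<std::string, std::string> const& dtypes) {
        std::vector<std::string> out;
        for (auto const& name : column_names) { out.push_back(dtypes.at(name)); }
        return out;
      }},
    arg);

  for (auto const& t : sorted_dtypes) { std::cout << t << '\n'; }  // prints: int64, float64
  return 0;
}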