From 9c72e56837ddfb3fb9b3d1111cdd08e1f53595c4 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sat, 14 Aug 2021 05:08:16 -0500 Subject: [PATCH 01/32] simplify io/functions.cpp data source/sink factories --- cpp/src/io/functions.cpp | 126 +++++++++++++++++++++------------------ 1 file changed, 67 insertions(+), 59 deletions(-) diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index bf51012211c..e080ea3a2ca 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -106,67 +106,56 @@ chunked_parquet_writer_options_builder chunked_parquet_writer_options::builder( } namespace { -template -std::unique_ptr make_reader(source_info const& src_info, - reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - if (src_info.type == io_type::FILEPATH) { - return std::make_unique(src_info.filepaths, options, stream, mr); - } - std::vector> datasources; - if (src_info.type == io_type::HOST_BUFFER) { - datasources = cudf::io::datasource::create(src_info.buffers); - } else if (src_info.type == io_type::USER_IMPLEMENTED) { - datasources = cudf::io::datasource::create(src_info.user_sources); - } else { - CUDF_FAIL("Unsupported source type"); +std::vector> make_datasources(source_info const& info) +{ + switch (info.type) { + case io_type::FILEPATH: return cudf::io::datasource::create(info.filepaths); + case io_type::HOST_BUFFER: return cudf::io::datasource::create(info.buffers); + case io_type::USER_IMPLEMENTED: return cudf::io::datasource::create(info.user_sources); + default: CUDF_FAIL("Unsupported source type"); } - - return std::make_unique(std::move(datasources), options, stream, mr); } -template -std::unique_ptr make_writer(sink_info const& sink, Ts&&... args) +std::unique_ptr make_datasink(sink_info const& info) { - if (sink.type == io_type::FILEPATH) { - return std::make_unique(cudf::io::data_sink::create(sink.filepath), - std::forward(args)...); - } - if (sink.type == io_type::HOST_BUFFER) { - return std::make_unique(cudf::io::data_sink::create(sink.buffer), - std::forward(args)...); + switch (info.type) { + case io_type::FILEPATH: return cudf::io::data_sink::create(info.filepath); + case io_type::HOST_BUFFER: return cudf::io::data_sink::create(info.buffer); + case io_type::VOID: return cudf::io::data_sink::create(); + case io_type::USER_IMPLEMENTED: return cudf::io::data_sink::create(info.user_sink); + default: CUDF_FAIL("Unsupported sink type"); } - if (sink.type == io_type::VOID) { - return std::make_unique(cudf::io::data_sink::create(), std::forward(args)...); - } - if (sink.type == io_type::USER_IMPLEMENTED) { - return std::make_unique(cudf::io::data_sink::create(sink.user_sink), - std::forward(args)...); - } - CUDF_FAIL("Unsupported sink type"); } } // namespace -table_with_metadata read_avro(avro_reader_options const& opts, rmm::mr::device_memory_resource* mr) +table_with_metadata read_avro(avro_reader_options const& options, + rmm::mr::device_memory_resource* mr) { namespace avro = cudf::io::detail::avro; CUDF_FUNC_RANGE(); - auto reader = make_reader(opts.get_source(), opts, rmm::cuda_stream_default, mr); - return reader->read(opts); + + auto datasources = make_datasources(options.get_source()); + auto reader = + std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); + + return reader->read(options); } -table_with_metadata read_json(json_reader_options const& opts, rmm::mr::device_memory_resource* mr) +table_with_metadata read_json(json_reader_options const& options, + 
rmm::mr::device_memory_resource* mr) { namespace json = cudf::io::detail::json; CUDF_FUNC_RANGE(); - auto reader = make_reader(opts.get_source(), opts, rmm::cuda_stream_default, mr); - return reader->read(opts); + + auto datasources = make_datasources(options.get_source()); + auto reader = + std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); + + return reader->read(options); } table_with_metadata read_csv(csv_reader_options const& options, rmm::mr::device_memory_resource* mr) @@ -174,8 +163,10 @@ table_with_metadata read_csv(csv_reader_options const& options, rmm::mr::device_ namespace csv = cudf::io::detail::csv; CUDF_FUNC_RANGE(); + + auto datasources = make_datasources(options.get_source()); auto reader = - make_reader(options.get_source(), options, rmm::cuda_stream_default, mr); + std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); return reader->read(); } @@ -185,7 +176,9 @@ void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resourc { using namespace cudf::io::detail; - auto writer = make_writer(options.get_sink(), options, rmm::cuda_stream_default, mr); + auto sink = make_datasink(options.get_sink()); + auto writer = + std::make_unique(std::move(sink), options, rmm::cuda_stream_default, mr); writer->write(options.get_table(), options.get_metadata()); } @@ -294,8 +287,10 @@ parsed_orc_statistics read_parsed_orc_statistics(source_info const& src_info) table_with_metadata read_orc(orc_reader_options const& options, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - auto reader = - make_reader(options.get_source(), options, rmm::cuda_stream_default, mr); + + auto datasources = make_datasources(options.get_source()); + auto reader = std::make_unique( + std::move(datasources), options, rmm::cuda_stream_default, mr); return reader->read(options); } @@ -305,11 +300,13 @@ table_with_metadata read_orc(orc_reader_options const& options, rmm::mr::device_ */ void write_orc(orc_writer_options const& options, rmm::mr::device_memory_resource* mr) { + namespace io_detail = cudf::io::detail; + CUDF_FUNC_RANGE(); - namespace io_detail = cudf::io::detail; - auto writer = make_writer( - options.get_sink(), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr); + auto sink = make_datasink(options.get_sink()); + auto writer = std::make_unique( + std::move(sink), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr); writer->write(options.get_table()); } @@ -317,12 +314,15 @@ void write_orc(orc_writer_options const& options, rmm::mr::device_memory_resourc /** * @copydoc cudf::io::orc_chunked_writer::orc_chunked_writer */ -orc_chunked_writer::orc_chunked_writer(chunked_orc_writer_options const& op, +orc_chunked_writer::orc_chunked_writer(chunked_orc_writer_options const& options, rmm::mr::device_memory_resource* mr) { namespace io_detail = cudf::io::detail; - writer = make_writer( - op.get_sink(), op, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr); + + auto sink = make_datasink(options.get_sink()); + + writer = std::make_unique( + std::move(sink), options, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr); } /** @@ -354,8 +354,10 @@ table_with_metadata read_parquet(parquet_reader_options const& options, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - auto reader = make_reader( - options.get_source(), options, rmm::cuda_stream_default, mr); + + auto datasources = make_datasources(options.get_source()); + auto reader = std::make_unique( + 
std::move(datasources), options, rmm::cuda_stream_default, mr); return reader->read(options); } @@ -392,25 +394,31 @@ table_input_metadata::table_input_metadata(table_view const& table, std::unique_ptr> write_parquet(parquet_writer_options const& options, rmm::mr::device_memory_resource* mr) { - CUDF_FUNC_RANGE(); namespace io_detail = cudf::io::detail; - auto writer = make_writer( - options.get_sink(), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr); + CUDF_FUNC_RANGE(); + + auto sink = make_datasink(options.get_sink()); + auto writer = std::make_unique( + std::move(sink), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr); writer->write(options.get_table()); + return writer->close(options.get_column_chunks_file_path()); } /** * @copydoc cudf::io::parquet_chunked_writer::parquet_chunked_writer */ -parquet_chunked_writer::parquet_chunked_writer(chunked_parquet_writer_options const& op, +parquet_chunked_writer::parquet_chunked_writer(chunked_parquet_writer_options const& options, rmm::mr::device_memory_resource* mr) { namespace io_detail = cudf::io::detail; - writer = make_writer( - op.get_sink(), op, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr); + + auto sink = make_datasink(options.get_sink()); + + writer = std::make_unique( + std::move(sink), options, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr); } /** From 88e23990151c737dcb4a22a5d6454ef8893285c4 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Tue, 17 Aug 2021 00:53:48 -0500 Subject: [PATCH 02/32] remove filepath-related logic from csv and json readers --- cpp/include/cudf/io/csv.hpp | 2 +- cpp/include/cudf/io/json.hpp | 2 +- cpp/src/io/comp/io_uncomp.h | 7 +++-- cpp/src/io/comp/uncomp.cpp | 19 ++++++------ cpp/src/io/csv/reader_impl.cu | 38 ++++++------------------ cpp/src/io/csv/reader_impl.hpp | 4 --- cpp/src/io/functions.cpp | 40 ++++++++++++++++++++++++-- cpp/src/io/json/reader_impl.cu | 31 ++------------------ cpp/src/io/json/reader_impl.hpp | 1 - cpp/src/io/utilities/parsing_utils.cu | 34 ---------------------- cpp/src/io/utilities/parsing_utils.cuh | 18 ------------ python/cudf/cudf/_lib/csv.pyx | 2 +- python/cudf/cudf/tests/test_csv.py | 14 --------- 13 files changed, 66 insertions(+), 146 deletions(-) diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index d4a21b2e98c..c807f189aac 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -1199,7 +1199,7 @@ class csv_reader_options_builder { * @return The set of columns along with metadata. */ table_with_metadata read_csv( - csv_reader_options const& options, + csv_reader_options options, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 8954f7dcab1..bca60f76260 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -364,7 +364,7 @@ class json_reader_options_builder { * @return The set of columns along with metadata. 
*/ table_with_metadata read_json( - json_reader_options const& options, + json_reader_options options, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/io/comp/io_uncomp.h b/cpp/src/io/comp/io_uncomp.h index 8daf73ecd0c..7b1feb84813 100644 --- a/cpp/src/io/comp/io_uncomp.h +++ b/cpp/src/io/comp/io_uncomp.h @@ -16,12 +16,13 @@ #pragma once +#include +#include + #include #include #include -#include - using cudf::host_span; namespace cudf { @@ -42,7 +43,7 @@ enum { std::vector io_uncompress_single_h2d(void const* src, size_t src_size, int stream_type); -std::vector get_uncompressed_data(host_span data, std::string const& compression); +std::vector get_uncompressed_data(host_span data, compression_type compression); class HostDecompressor { public: diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index 2cb99d897fe..ee451d04dbb 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -369,6 +369,7 @@ std::vector io_uncompress_single_h2d(const void* src, size_t src_size, int // Unsupported format break; } + CUDF_EXPECTS(comp_data != nullptr, "Unsupported compressed stream type"); CUDF_EXPECTS(comp_len > 0, "Unsupported compressed stream type"); @@ -422,17 +423,17 @@ std::vector io_uncompress_single_h2d(const void* src, size_t src_size, int * @return Vector containing the output uncompressed data */ std::vector get_uncompressed_data(host_span const data, - std::string const& compression) + compression_type compression) { int comp_type = IO_UNCOMP_STREAM_TYPE_INFER; - if (compression == "gzip") - comp_type = IO_UNCOMP_STREAM_TYPE_GZIP; - else if (compression == "zip") - comp_type = IO_UNCOMP_STREAM_TYPE_ZIP; - else if (compression == "bz2") - comp_type = IO_UNCOMP_STREAM_TYPE_BZIP2; - else if (compression == "xz") - comp_type = IO_UNCOMP_STREAM_TYPE_XZ; + + switch (compression) { + case compression_type::GZIP: comp_type = IO_UNCOMP_STREAM_TYPE_GZIP; break; + case compression_type::ZIP: comp_type = IO_UNCOMP_STREAM_TYPE_ZIP; break; + case compression_type::BZIP2: comp_type = IO_UNCOMP_STREAM_TYPE_BZIP2; break; + case compression_type::XZ: comp_type = IO_UNCOMP_STREAM_TYPE_XZ; break; + default: break; + } return io_uncompress_single_h2d(data.data(), data.size(), comp_type); } diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 549b0474fe1..a85a610962e 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -206,10 +206,12 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) auto num_rows = opts_.get_nrows(); if (range_offset > 0 || range_size > 0) { - CUDF_EXPECTS(compression_type_ == "none", + CUDF_EXPECTS(opts_.get_compression() == compression_type::NONE, "Reading compressed data using `byte range` is unsupported"); } + size_t map_range_size = 0; + if (range_size != 0) { auto num_given_dtypes = std::visit([](const auto& dtypes) { return dtypes.size(); }, opts_.get_dtypes()); @@ -217,12 +219,7 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) map_range_size = range_size + calculateMaxRowSize(num_columns); } - // Support delayed opening of the file if using memory mapping datasource - // This allows only mapping of a subset of the file if using byte range - if (source_ == nullptr) { - assert(!filepath_.empty()); - source_ = datasource::create(filepath_, range_offset, map_range_size); - } + // TODO: provide hint to datasource that we should memory map any underlying file. 
// Transfer source data to GPU if (!source_->is_empty()) { @@ -235,10 +232,11 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) std::vector h_uncomp_data_owner; - if (compression_type_ != "none") { - h_uncomp_data_owner = get_uncompressed_data(h_data, compression_type_); + if (opts_.get_compression() != compression_type::NONE) { + h_uncomp_data_owner = get_uncompressed_data(h_data, opts_.get_compression()); h_data = h_uncomp_data_owner; } + // None of the parameters for row selection is used, we are parsing the entire file const bool load_whole_file = range_offset == 0 && range_size == 0 && skip_rows <= 0 && skip_end_rows <= 0 && num_rows == -1; @@ -927,35 +925,17 @@ parse_options make_parse_options(csv_reader_options const& reader_opts, } reader::impl::impl(std::unique_ptr source, - std::string filepath, csv_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : mr_(mr), source_(std::move(source)), filepath_(filepath), opts_(options) + : mr_(mr), source_(std::move(source)), opts_(options) { num_actual_cols_ = opts_.get_names().size(); num_active_cols_ = num_actual_cols_; - compression_type_ = - infer_compression_type(opts_.get_compression(), - filepath, - {{"gz", "gzip"}, {"zip", "zip"}, {"bz2", "bz2"}, {"xz", "xz"}}); - opts = make_parse_options(options, stream); } -// Forward to implementation -reader::reader(std::vector const& filepaths, - csv_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(filepaths.size() == 1, "Only a single source is currently supported."); - // Delay actual instantiation of data source until read to allow for - // partial memory mapping of file using byte ranges - _impl = std::make_unique(nullptr, filepaths[0], options, stream, mr); -} - // Forward to implementation reader::reader(std::vector>&& sources, csv_reader_options const& options, @@ -963,7 +943,7 @@ reader::reader(std::vector>&& sources, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(sources.size() == 1, "Only a single source is currently supported."); - _impl = std::make_unique(std::move(sources[0]), "", options, stream, mr); + _impl = std::make_unique(std::move(sources[0]), options, stream, mr); } // Destructor within this translation unit diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp index 36c2bf4f9e7..beaa9b816cb 100644 --- a/cpp/src/io/csv/reader_impl.hpp +++ b/cpp/src/io/csv/reader_impl.hpp @@ -72,13 +72,11 @@ class reader::impl { * @brief Constructor from a dataset source with reader options. 
* * @param source Dataset source - * @param filepath Filepath if reading dataset from a file * @param options Settings for controlling reading behavior * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ explicit impl(std::unique_ptr source, - std::string filepath, csv_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); @@ -222,8 +220,6 @@ class reader::impl { private: rmm::mr::device_memory_resource* mr_ = nullptr; std::unique_ptr source_; - std::string filepath_; - std::string compression_type_; const csv_reader_options opts_; cudf::size_type num_records_ = 0; // Number of rows with actual data diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index e080ea3a2ca..ccc2eef56c7 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -144,27 +144,61 @@ table_with_metadata read_avro(avro_reader_options const& options, return reader->read(options); } -table_with_metadata read_json(json_reader_options const& options, - rmm::mr::device_memory_resource* mr) +compression_type infer_compression_type(compression_type compression, source_info const& info) +{ + if (compression != compression_type::AUTO) { return compression; } + + if (info.type != io_type::FILEPATH) { return compression_type::NONE; } + + auto filepath = info.filepaths[0]; + + // Attempt to infer from the file extension + const auto pos = filepath.find_last_of('.'); + + if (pos == std::string::npos) { return {}; } + + auto str_tolower = [](const auto& begin, const auto& end) { + std::string out; + std::transform(begin, end, std::back_inserter(out), ::tolower); + return out; + }; + + const auto ext = str_tolower(filepath.begin() + pos + 1, filepath.end()); + + if (ext == "gz") { return compression_type::GZIP; } + if (ext == "zip") { return compression_type::ZIP; } + if (ext == "bz2") { return compression_type::BZIP2; } + if (ext == "xz") { return compression_type::XZ; } + + return compression_type::NONE; +} + +table_with_metadata read_json(json_reader_options options, rmm::mr::device_memory_resource* mr) { namespace json = cudf::io::detail::json; CUDF_FUNC_RANGE(); auto datasources = make_datasources(options.get_source()); + + options.set_compression(infer_compression_type(options.get_compression(), options.get_source())); + auto reader = std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); return reader->read(options); } -table_with_metadata read_csv(csv_reader_options const& options, rmm::mr::device_memory_resource* mr) +table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_resource* mr) { namespace csv = cudf::io::detail::csv; CUDF_FUNC_RANGE(); auto datasources = make_datasources(options.get_source()); + + options.set_compression(infer_compression_type(options.get_compression(), options.get_source())); + auto reader = std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index a8f117c22bf..bae7471e307 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -241,15 +241,6 @@ void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size) map_range_size = range_size + calculate_max_row_size(dtype_option_size); } - // Support delayed opening of the file if using memory mapping datasource - // This allows only mapping of a subset of the file if using byte range - 
if (sources_.empty()) { - assert(!filepaths_.empty()); - for (const auto& path : filepaths_) { - sources_.emplace_back(datasource::create(path, range_offset, map_range_size)); - } - } - // Iterate through the user defined sources and read the contents into the local buffer CUDF_EXPECTS(!sources_.empty(), "No sources were defined"); size_t total_source_size = 0; @@ -280,11 +271,7 @@ void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size) */ void reader::impl::decompress_input(rmm::cuda_stream_view stream) { - const auto compression_type = - infer_compression_type(options_.get_compression(), - filepaths_.size() > 0 ? filepaths_[0] : "", - {{"gz", "gzip"}, {"zip", "zip"}, {"bz2", "bz2"}, {"xz", "xz"}}); - if (compression_type == "none") { + if (options_.get_compression() == compression_type::NONE) { // Do not use the owner vector here to avoid extra copy uncomp_data_ = reinterpret_cast(buffer_.data()); uncomp_size_ = buffer_.size(); @@ -293,7 +280,7 @@ void reader::impl::decompress_input(rmm::cuda_stream_view stream) host_span( // reinterpret_cast(buffer_.data()), buffer_.size()), - compression_type); + options_.get_compression()); uncomp_data_ = uncomp_data_owner_.data(); uncomp_size_ = uncomp_data_owner_.size(); @@ -665,7 +652,7 @@ reader::impl::impl(std::vector>&& sources, json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : options_(options), mr_(mr), sources_(std::move(sources)), filepaths_(filepaths) + : options_(options), mr_(mr), sources_(std::move(sources)) { CUDF_EXPECTS(options_.is_enabled_lines(), "Only JSON Lines format is currently supported.\n"); @@ -713,18 +700,6 @@ table_with_metadata reader::impl::read(json_reader_options const& options, return convert_data_to_table(rec_starts, stream); } -// Forward to implementation -reader::reader(std::vector const& filepaths, - json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // Delay actual instantiation of data source until read to allow for - // partial memory mapping of file using byte ranges - std::vector> src = {}; // Empty datasources - _impl = std::make_unique(std::move(src), filepaths, options, stream, mr); -} - // Forward to implementation reader::reader(std::vector>&& sources, json_reader_options const& options, diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 5cf51369cdf..f7af55b2b90 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -57,7 +57,6 @@ class reader::impl { rmm::mr::device_memory_resource* mr_ = nullptr; std::vector> sources_; - std::vector filepaths_; std::vector buffer_; const char* uncomp_data_ = nullptr; diff --git a/cpp/src/io/utilities/parsing_utils.cu b/cpp/src/io/utilities/parsing_utils.cu index 6c8f01111e5..ba62238c5d3 100644 --- a/cpp/src/io/utilities/parsing_utils.cu +++ b/cpp/src/io/utilities/parsing_utils.cu @@ -209,39 +209,5 @@ cudf::size_type count_all_from_set(const char* h_data, return find_all_from_set(h_data, h_size, keys, 0, nullptr, stream); } -std::string infer_compression_type( - const compression_type& compression_arg, - const std::string& filename, - const std::vector>& ext_to_comp_map) -{ - auto str_tolower = [](const auto& begin, const auto& end) { - std::string out; - std::transform(begin, end, std::back_inserter(out), ::tolower); - return out; - }; - - // Attempt to infer from user-supplied argument - if (compression_arg != compression_type::AUTO) { - switch (compression_arg) { - 
case compression_type::GZIP: return "gzip"; - case compression_type::BZIP2: return "bz2"; - case compression_type::ZIP: return "zip"; - case compression_type::XZ: return "xz"; - default: break; - } - } - - // Attempt to infer from the file extension - const auto pos = filename.find_last_of('.'); - if (pos != std::string::npos) { - const auto ext = str_tolower(filename.begin() + pos + 1, filename.end()); - for (const auto& mapping : ext_to_comp_map) { - if (mapping.first == ext) { return mapping.second; } - } - } - - return "none"; -} - } // namespace io } // namespace cudf diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 88297423b9b..daf23de7eb2 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -454,24 +454,6 @@ cudf::size_type count_all_from_set(const char* h_data, const std::vector& keys, rmm::cuda_stream_view stream); -/** - * @brief Infer file compression type based on user supplied arguments. - * - * If the user specifies a valid compression_type for compression arg, - * compression type will be computed based on that. Otherwise the filename - * and ext_to_comp_map will be used. - * - * @param[in] compression_arg User specified compression type (if any) - * @param[in] filename Filename to base compression type (by extension) on - * @param[in] ext_to_comp_map User supplied mapping of file extension to compression type - * - * @return string representing compression type ("gzip, "bz2", etc) - */ -std::string infer_compression_type( - const compression_type& compression_arg, - const std::string& filename, - const std::vector>& ext_to_comp_map); - /** * @brief Checks whether the given character is a whitespace character. * diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index a15a180d466..7a54ccac197 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -101,7 +101,7 @@ cdef csv_reader_options make_csv_reader_options( bool na_filter, object prefix, object index_col, -) except +: +) except *: cdef source_info c_source_info = make_source_info([datasource]) cdef compression_type c_compression cdef size_type c_header diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 5511a65d0a4..8fb5d7cc9eb 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1069,20 +1069,6 @@ def test_csv_reader_byte_range(tmpdir, segment_bytes): assert list(df["int2"]) == list(ref_df["int2"]) -def test_csv_reader_byte_range_type_corner_case(tmpdir): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file17.csv") - - cudf.datasets.timeseries( - start="2000-01-01", - end="2000-01-02", - dtypes={"name": str, "id": int, "x": float, "y": float}, - ).to_csv(fname, chunksize=100000) - - byte_range = (2_147_483_648, 0) - with pytest.raises(RuntimeError, match="Offset is past end of file"): - cudf.read_csv(fname, byte_range=byte_range, header=None) - - @pytest.mark.parametrize("segment_bytes", [10, 19, 31, 36]) def test_csv_reader_byte_range_strings(segment_bytes): names = ["strings"] From 62b95202d9b1db14f765ef45644d9cf91f782ea7 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Tue, 17 Aug 2021 02:19:27 -0500 Subject: [PATCH 03/32] remove filepath logic from avro, parquet, orc readers --- cpp/include/cudf/io/detail/avro.hpp | 13 ------------- cpp/include/cudf/io/detail/orc.hpp | 13 ------------- cpp/include/cudf/io/detail/parquet.hpp | 13 ------------- cpp/src/io/avro/reader_impl.cu | 10 
---------- cpp/src/io/orc/reader_impl.cu | 9 --------- cpp/src/io/parquet/reader_impl.cu | 9 --------- 6 files changed, 67 deletions(-) diff --git a/cpp/include/cudf/io/detail/avro.hpp b/cpp/include/cudf/io/detail/avro.hpp index 98483d1c03e..306c15dcb72 100644 --- a/cpp/include/cudf/io/detail/avro.hpp +++ b/cpp/include/cudf/io/detail/avro.hpp @@ -38,19 +38,6 @@ class reader { std::unique_ptr _impl; public: - /** - * @brief Constructor from an array of file paths - * - * @param filepaths Paths to the files containing the input dataset - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - explicit reader(std::vector const& filepaths, - avro_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - /** * @brief Constructor from an array of datasources * diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp index ab26c01db74..2174b688da2 100644 --- a/cpp/include/cudf/io/detail/orc.hpp +++ b/cpp/include/cudf/io/detail/orc.hpp @@ -47,19 +47,6 @@ class reader { std::unique_ptr _impl; public: - /** - * @brief Constructor from an array of file paths - * - * @param filepaths Paths to the files containing the input dataset - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - explicit reader(std::vector const& filepaths, - orc_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - /** * @brief Constructor from an array of datasources * diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index d95af7a11da..14f27ef8eef 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -49,19 +49,6 @@ class reader { std::unique_ptr _impl; public: - /** - * @brief Constructor from an array of file paths - * - * @param filepaths Paths to the files containing the input dataset - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - explicit reader(std::vector const& filepaths, - parquet_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - /** * @brief Constructor from an array of datasources * diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index f6ffdd99d35..08ea96139a1 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -474,16 +474,6 @@ table_with_metadata reader::impl::read(avro_reader_options const& options, return {std::make_unique(std::move(out_columns)), std::move(metadata_out)}; } -// Forward to implementation -reader::reader(std::vector const& filepaths, - avro_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(filepaths.size() == 1, "Only a single source is currently supported."); - _impl = std::make_unique(datasource::create(filepaths[0]), options, mr); -} - // Forward to implementation reader::reader(std::vector>&& sources, avro_reader_options const& options, diff --git a/cpp/src/io/orc/reader_impl.cu 
b/cpp/src/io/orc/reader_impl.cu
index 033a2d9aff5..5d62c45df83 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -1383,15 +1383,6 @@ table_with_metadata reader::impl::read(size_type skip_rows,
   return {std::make_unique<table>
(std::move(out_columns)), std::move(out_metadata)};
 }
 
-// Forward to implementation
-reader::reader(std::vector<std::string> const& filepaths,
-               orc_reader_options const& options,
-               rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource* mr)
-{
-  _impl = std::make_unique<impl>(datasource::create(filepaths), options, mr);
-}
-
 // Forward to implementation
 reader::reader(std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
                orc_reader_options const& options,
diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu
index 9f9bdfd4755..31ae763d9ff 100644
--- a/cpp/src/io/parquet/reader_impl.cu
+++ b/cpp/src/io/parquet/reader_impl.cu
@@ -1608,15 +1608,6 @@ table_with_metadata reader::impl::read(size_type skip_rows,
   return {std::make_unique<table>
(std::move(out_columns)), std::move(out_metadata)}; } -// Forward to implementation -reader::reader(std::vector const& filepaths, - parquet_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : _impl(std::make_unique(datasource::create(filepaths), options, mr)) -{ -} - // Forward to implementation reader::reader(std::vector>&& sources, parquet_reader_options const& options, From fb0129433bdd2dd264105ba172d96f2a310d8d8d Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 18 Aug 2021 15:19:11 -0500 Subject: [PATCH 04/32] move range size padding calculation out of json/csv reader and in to json/csv options --- cpp/include/cudf/io/csv.hpp | 34 +++++++++++++++++++++ cpp/include/cudf/io/json.hpp | 32 +++++++++++++++++++ cpp/src/io/csv/reader_impl.cu | 49 +++++------------------------- cpp/src/io/functions.cpp | 24 +++++++++++---- cpp/src/io/json/reader_impl.cu | 46 ++++++---------------------- cpp/src/io/json/reader_impl.hpp | 3 +- python/cudf/cudf/tests/test_csv.py | 14 +++++++++ 7 files changed, 116 insertions(+), 86 deletions(-) diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index c807f189aac..1aa6e3bea29 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -177,6 +177,40 @@ class csv_reader_options { */ std::size_t get_byte_range_size() const { return _byte_range_size; } + /** + * @brief Returns number of bytes to read with padding. + */ + std::size_t get_byte_range_size_with_padding() const + { + if (_byte_range_size == 0) { + return 0; + } else { + return _byte_range_size + get_byte_range_padding(); + } + } + + /** + * @brief Returns number of bytes to pad when reading. + */ + std::size_t get_byte_range_padding() const + { + auto const num_names = _names.size(); + auto const num_dtypes = std::visit([](const auto& dtypes) { return dtypes.size(); }, _dtypes); + auto const num_columns = std::max(num_dtypes, num_names); + + auto const max_row_bytes = 16 * 1024; // 16KB + auto const column_bytes = 64; + auto const base_padding = 1024; // 1KB + + if (num_columns == 0) { + // Use flat size if the number of columns is not known + return max_row_bytes; + } + + // Expand the size based on the number of columns, if available + return base_padding + num_columns * column_bytes; + } + /** * @brief Returns names of the columns. */ diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index bca60f76260..5d2a4f6fcd1 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -140,6 +140,38 @@ class json_reader_options { */ size_t get_byte_range_size() const { return _byte_range_size; } + /** + * @brief Returns number of bytes to read with padding. + */ + size_t get_byte_range_size_with_padding() const + { + if (_byte_range_size == 0) { + return 0; + } else { + return _byte_range_size + get_byte_range_padding(); + } + } + + /** + * @brief Returns number of bytes to pad when reading. + */ + size_t get_byte_range_padding() const + { + auto const num_columns = std::visit([](const auto& dtypes) { return dtypes.size(); }, _dtypes); + + auto const max_row_bytes = 16 * 1024; // 16KB + auto const column_bytes = 64; + auto const base_padding = 1024; // 1KB + + if (num_columns == 0) { + // Use flat size if the number of columns is not known + return max_row_bytes; + } + + // Expand the size based on the number of columns, if available + return base_padding + num_columns * column_bytes; + } + /** * @brief Whether to read the file as a json object per line. 
*/ diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index a85a610962e..c61cc26800e 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -57,31 +57,6 @@ namespace csv { using namespace cudf::io::csv; using namespace cudf::io; -/** - * @brief Estimates the maximum expected length or a row, based on the number - * of columns - * - * If the number of columns is not available, it will return a value large - * enough for most use cases - * - * @param[in] num_columns Number of columns in the CSV file (optional) - * - * @return Estimated maximum size of a row, in bytes - */ -constexpr size_t calculateMaxRowSize(int num_columns = 0) noexcept -{ - constexpr size_t max_row_bytes = 16 * 1024; // 16KB - constexpr size_t column_bytes = 64; - constexpr size_t base_padding = 1024; // 1KB - if (num_columns == 0) { - // Use flat size if the number of columns is not known - return max_row_bytes; - } else { - // Expand the size based on the number of columns, if available - return base_padding + num_columns * column_bytes; - } -} - /** * @brief Translates a dtype string and returns its dtype enumeration and any * extended dtype flags that are supported by cuIO. Often, this is a column @@ -199,31 +174,21 @@ void erase_except_last(C& container, rmm::cuda_stream_view stream) std::pair, reader::impl::selected_rows_offsets> reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) { - auto range_offset = opts_.get_byte_range_offset(); - auto range_size = opts_.get_byte_range_size(); - auto skip_rows = opts_.get_skiprows(); - auto skip_end_rows = opts_.get_skipfooter(); - auto num_rows = opts_.get_nrows(); + auto range_offset = opts_.get_byte_range_offset(); + auto range_size = opts_.get_byte_range_size(); + auto range_size_padded = opts_.get_byte_range_size_with_padding(); + auto skip_rows = opts_.get_skiprows(); + auto skip_end_rows = opts_.get_skipfooter(); + auto num_rows = opts_.get_nrows(); if (range_offset > 0 || range_size > 0) { CUDF_EXPECTS(opts_.get_compression() == compression_type::NONE, "Reading compressed data using `byte range` is unsupported"); } - size_t map_range_size = 0; - - if (range_size != 0) { - auto num_given_dtypes = - std::visit([](const auto& dtypes) { return dtypes.size(); }, opts_.get_dtypes()); - const auto num_columns = std::max(opts_.get_names().size(), num_given_dtypes); - map_range_size = range_size + calculateMaxRowSize(num_columns); - } - - // TODO: provide hint to datasource that we should memory map any underlying file. - // Transfer source data to GPU if (!source_->is_empty()) { - auto data_size = (map_range_size != 0) ? map_range_size : source_->size(); + auto data_size = (range_size_padded != 0) ? 
range_size_padded : source_->size(); auto buffer = source_->host_read(range_offset, data_size); auto h_data = host_span( // diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index ccc2eef56c7..438cb1762c6 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -107,10 +107,18 @@ chunked_parquet_writer_options_builder chunked_parquet_writer_options::builder( namespace { -std::vector> make_datasources(source_info const& info) +std::vector> make_datasources(source_info const& info, + size_t range_offset = 0, + size_t range_size = 0) { switch (info.type) { - case io_type::FILEPATH: return cudf::io::datasource::create(info.filepaths); + case io_type::FILEPATH: { + auto sources = std::vector>(); + for (auto const& filepath : info.filepaths) { + sources.emplace_back(cudf::io::datasource::create(filepath, range_offset, range_size)); + } + return sources; + } case io_type::HOST_BUFFER: return cudf::io::datasource::create(info.buffers); case io_type::USER_IMPLEMENTED: return cudf::io::datasource::create(info.user_sources); default: CUDF_FAIL("Unsupported source type"); @@ -179,10 +187,12 @@ table_with_metadata read_json(json_reader_options options, rmm::mr::device_memor CUDF_FUNC_RANGE(); - auto datasources = make_datasources(options.get_source()); - options.set_compression(infer_compression_type(options.get_compression(), options.get_source())); + auto datasources = make_datasources(options.get_source(), + options.get_byte_range_offset(), + options.get_byte_range_size_with_padding()); + auto reader = std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); @@ -195,10 +205,12 @@ table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_ CUDF_FUNC_RANGE(); - auto datasources = make_datasources(options.get_source()); - options.set_compression(infer_compression_type(options.get_compression(), options.get_source())); + auto datasources = make_datasources(options.get_source(), + options.get_byte_range_offset(), + options.get_byte_range_size_with_padding()); + auto reader = std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index bae7471e307..0618f02e98f 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -50,31 +50,6 @@ namespace detail { namespace json { using namespace cudf::io; -namespace { -/** - * @brief Estimates the maximum expected length or a row, based on the number - * of columns - * - * If the number of columns is not available, it will return a value large - * enough for most use cases - * - * @param[in] num_columns Number of columns in the JSON file (optional) - * - * @return Estimated maximum size of a row, in bytes - */ -constexpr size_t calculate_max_row_size(int num_columns = 0) noexcept -{ - constexpr size_t max_row_bytes = 16 * 1024; // 16KB - constexpr size_t column_bytes = 64; - constexpr size_t base_padding = 1024; // 1KB - return num_columns == 0 - ? max_row_bytes // Use flat size if the # of columns is not known - : base_padding + - num_columns * column_bytes; // Expand size based on the # of columns, if available -} - -} // anonymous namespace - /** * @brief Aggregate the table containing keys info by their hash values. 
* @@ -231,16 +206,12 @@ std::pair, col_map_ptr_type> reader::impl::get_json_obj * * @param[in] range_offset Number of bytes offset from the start * @param[in] range_size Bytes to read; use `0` for all remaining data + * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data */ -void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size) +void reader::impl::ingest_raw_input(size_t range_offset, + size_t range_size, + size_t range_size_padded) { - size_t map_range_size = 0; - if (range_size != 0) { - auto const dtype_option_size = - std::visit([](const auto& dtypes) { return dtypes.size(); }, options_.get_dtypes()); - map_range_size = range_size + calculate_max_row_size(dtype_option_size); - } - // Iterate through the user defined sources and read the contents into the local buffer CUDF_EXPECTS(!sources_.empty(), "No sources were defined"); size_t total_source_size = 0; @@ -253,7 +224,7 @@ void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size) size_t bytes_read = 0; for (const auto& source : sources_) { if (!source->is_empty()) { - auto data_size = (map_range_size != 0) ? map_range_size : source->size(); + auto data_size = (range_size_padded != 0) ? range_size_padded : source->size(); bytes_read += source->host_read(range_offset, data_size, &buffer_[bytes_read]); } } @@ -675,10 +646,11 @@ reader::impl::impl(std::vector>&& sources, table_with_metadata reader::impl::read(json_reader_options const& options, rmm::cuda_stream_view stream) { - auto range_offset = options.get_byte_range_offset(); - auto range_size = options.get_byte_range_size(); + auto range_offset = options.get_byte_range_offset(); + auto range_size = options.get_byte_range_size(); + auto range_size_padded = options.get_byte_range_size_with_padding(); - ingest_raw_input(range_offset, range_size); + ingest_raw_input(range_offset, range_size, range_size_padded); CUDF_EXPECTS(buffer_.size() != 0, "Ingest failed: input data is null.\n"); decompress_input(stream); diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index f7af55b2b90..d01f2e8677e 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -109,8 +109,9 @@ class reader::impl { * * @param[in] range_offset Number of bytes offset from the start * @param[in] range_size Bytes to read; use `0` for all remaining data + * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data */ - void ingest_raw_input(size_t range_offset, size_t range_size); + void ingest_raw_input(size_t range_offset, size_t range_size, size_t range_size_padded); /** * @brief Extract the JSON objects keys from the input file with object rows. 
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 8fb5d7cc9eb..5511a65d0a4 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1069,6 +1069,20 @@ def test_csv_reader_byte_range(tmpdir, segment_bytes): assert list(df["int2"]) == list(ref_df["int2"]) +def test_csv_reader_byte_range_type_corner_case(tmpdir): + fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file17.csv") + + cudf.datasets.timeseries( + start="2000-01-01", + end="2000-01-02", + dtypes={"name": str, "id": int, "x": float, "y": float}, + ).to_csv(fname, chunksize=100000) + + byte_range = (2_147_483_648, 0) + with pytest.raises(RuntimeError, match="Offset is past end of file"): + cudf.read_csv(fname, byte_range=byte_range, header=None) + + @pytest.mark.parametrize("segment_bytes", [10, 19, 31, 36]) def test_csv_reader_byte_range_strings(segment_bytes): names = ["strings"] From d422aebbe62d7e9915af93f474563e6e1c571e97 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 18 Aug 2021 15:30:38 -0500 Subject: [PATCH 05/32] remove filepaths from json reader --- cpp/src/io/json/reader_impl.cu | 12 +++++------- cpp/src/io/json/reader_impl.hpp | 3 +-- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 0618f02e98f..2964a12568f 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -231,7 +231,7 @@ void reader::impl::ingest_raw_input(size_t range_offset, byte_range_offset_ = range_offset; byte_range_size_ = range_size; - load_whole_file_ = byte_range_offset_ == 0 && byte_range_size_ == 0; + load_whole_source_ = byte_range_offset_ == 0 && byte_range_size_ == 0; } /** @@ -256,7 +256,7 @@ void reader::impl::decompress_input(rmm::cuda_stream_view stream) uncomp_data_ = uncomp_data_owner_.data(); uncomp_size_ = uncomp_data_owner_.size(); } - if (load_whole_file_) data_ = rmm::device_buffer(uncomp_data_, uncomp_size_, stream); + if (load_whole_source_) data_ = rmm::device_buffer(uncomp_data_, uncomp_size_, stream); } rmm::device_uvector reader::impl::find_record_starts(rmm::cuda_stream_view stream) @@ -268,7 +268,7 @@ rmm::device_uvector reader::impl::find_record_starts(rmm::cuda_stream_ if (allow_newlines_in_strings_) { chars_to_count.push_back('\"'); } // If not starting at an offset, add an extra row to account for the first row in the file cudf::size_type prefilter_count = ((byte_range_offset_ == 0) ? 
1 : 0); - if (load_whole_file_) { + if (load_whole_source_) { prefilter_count += count_all_from_set(data_, chars_to_count, stream); } else { prefilter_count += count_all_from_set(uncomp_data_, uncomp_size_, chars_to_count, stream); @@ -286,7 +286,7 @@ rmm::device_uvector reader::impl::find_record_starts(rmm::cuda_stream_ std::vector chars_to_find{'\n'}; if (allow_newlines_in_strings_) { chars_to_find.push_back('\"'); } // Passing offset = 1 to return positions AFTER the found character - if (load_whole_file_) { + if (load_whole_source_) { find_all_from_set(data_, chars_to_find, 1, find_result_ptr, stream); } else { find_all_from_set(uncomp_data_, uncomp_size_, chars_to_find, 1, find_result_ptr, stream); @@ -619,7 +619,6 @@ table_with_metadata reader::impl::convert_data_to_table(device_span>&& sources, - std::vector const& filepaths, json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -678,8 +677,7 @@ reader::reader(std::vector>&& sources, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - std::vector file_paths = {}; // Empty filepaths - _impl = std::make_unique(std::move(sources), file_paths, options, stream, mr); + _impl = std::make_unique(std::move(sources), options, stream, mr); } // Destructor within this translation unit diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index d01f2e8677e..d910cce2d72 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -68,7 +68,7 @@ class reader::impl { size_t byte_range_offset_ = 0; size_t byte_range_size_ = 0; - bool load_whole_file_ = true; + bool load_whole_source_ = true; table_metadata metadata_; std::vector dtypes_; @@ -186,7 +186,6 @@ class reader::impl { * @brief Constructor from a dataset source with reader options. */ explicit impl(std::vector>&& sources, - std::vector const& filepaths, json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); From e0cac1d39aa5143900ed0fbeb71ea4440a059252 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 19 Aug 2021 17:37:50 -0500 Subject: [PATCH 06/32] replace json reader impl buffer member with local variable --- cpp/src/io/json/reader_impl.cu | 37 +++++++++++++++------------------ cpp/src/io/json/reader_impl.hpp | 9 +++++--- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 2964a12568f..cfda7bb11dc 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -199,16 +199,8 @@ std::pair, col_map_ptr_type> reader::impl::get_json_obj create_col_names_hash_map(sorted_info->get_column(2).view(), stream)}; } -/** - * @brief Ingest input JSON file/buffer, without decompression. 
- * - * Sets the sources_, byte_range_offset_, and byte_range_size_ data members - * - * @param[in] range_offset Number of bytes offset from the start - * @param[in] range_size Bytes to read; use `0` for all remaining data - * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data - */ -void reader::impl::ingest_raw_input(size_t range_offset, +void reader::impl::ingest_raw_input(std::vector& buffer, + size_t range_offset, size_t range_size, size_t range_size_padded) { @@ -220,12 +212,12 @@ void reader::impl::ingest_raw_input(size_t range_offset, } total_source_size = total_source_size - range_offset; - buffer_.resize(total_source_size); + buffer.resize(total_source_size); size_t bytes_read = 0; for (const auto& source : sources_) { if (!source->is_empty()) { auto data_size = (range_size_padded != 0) ? range_size_padded : source->size(); - bytes_read += source->host_read(range_offset, data_size, &buffer_[bytes_read]); + bytes_read += source->host_read(range_offset, data_size, &buffer[bytes_read]); } } @@ -240,17 +232,18 @@ void reader::impl::ingest_raw_input(size_t range_offset, * Sets the uncomp_data_ and uncomp_size_ data members * Loads the data into device memory if byte range parameters are not used */ -void reader::impl::decompress_input(rmm::cuda_stream_view stream) +void reader::impl::decompress_input(std::vector const& buffer, + rmm::cuda_stream_view stream) { if (options_.get_compression() == compression_type::NONE) { // Do not use the owner vector here to avoid extra copy - uncomp_data_ = reinterpret_cast(buffer_.data()); - uncomp_size_ = buffer_.size(); + uncomp_data_ = reinterpret_cast(buffer.data()); + uncomp_size_ = buffer.size(); } else { uncomp_data_owner_ = get_uncompressed_data( // host_span( // - reinterpret_cast(buffer_.data()), - buffer_.size()), + reinterpret_cast(buffer.data()), + buffer.size()), options_.get_compression()); uncomp_data_ = uncomp_data_owner_.data(); @@ -649,10 +642,14 @@ table_with_metadata reader::impl::read(json_reader_options const& options, auto range_size = options.get_byte_range_size(); auto range_size_padded = options.get_byte_range_size_with_padding(); - ingest_raw_input(range_offset, range_size, range_size_padded); - CUDF_EXPECTS(buffer_.size() != 0, "Ingest failed: input data is null.\n"); + std::vector buffer; + + ingest_raw_input(buffer, range_offset, range_size, range_size_padded); + + CUDF_EXPECTS(buffer.size() != 0, "Ingest failed: input data is null.\n"); + + decompress_input(buffer, stream); - decompress_input(stream); CUDF_EXPECTS(uncomp_data_ != nullptr, "Ingest failed: uncompressed input data is null.\n"); CUDF_EXPECTS(uncomp_size_ != 0, "Ingest failed: uncompressed input data has zero size.\n"); diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index d910cce2d72..5e07c38a4c7 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -57,7 +57,6 @@ class reader::impl { rmm::mr::device_memory_resource* mr_ = nullptr; std::vector> sources_; - std::vector buffer_; const char* uncomp_data_ = nullptr; size_t uncomp_size_ = 0; @@ -107,11 +106,15 @@ class reader::impl { * * Sets the source_, byte_range_offset_, and byte_range_size_ data members * + * @param[in] buffer Buffer to read the bytes in to * @param[in] range_offset Number of bytes offset from the start * @param[in] range_size Bytes to read; use `0` for all remaining data * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data */ - void 
ingest_raw_input(size_t range_offset, size_t range_size, size_t range_size_padded);
+  void ingest_raw_input(std::vector<uint8_t>& buffer,
+                        size_t range_offset,
+                        size_t range_size,
+                        size_t range_size_padded);
 
   /**
    * @brief Extract the JSON objects keys from the input file with object rows.
@@ -126,7 +129,7 @@ class reader::impl {
    *
    * Sets the uncomp_data_ and uncomp_size_ data members
    */
-  void decompress_input(rmm::cuda_stream_view stream);
+  void decompress_input(std::vector<uint8_t> const& buffer, rmm::cuda_stream_view stream);
 
   /**
    * @brief Finds all record starts in the file.
From dc236348b141ad8878ab07c49f9760037101a1c8 Mon Sep 17 00:00:00 2001
From: Christopher Harris
Date: Thu, 19 Aug 2021 21:25:09 -0500
Subject: [PATCH 07/32] replace json reader sources member with local variable

---
 cpp/include/cudf/io/detail/json.hpp | 25 +++++++----------------
 cpp/src/io/functions.cpp            |  5 ++---
 cpp/src/io/json/reader_impl.cu      | 32 +++++++++++++++--------------
 cpp/src/io/json/reader_impl.hpp     | 13 ++++++------
 4 files changed, 33 insertions(+), 42 deletions(-)

diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp
index e6d8f2de483..f39f42626bc 100644
--- a/cpp/include/cudf/io/detail/json.hpp
+++ b/cpp/include/cudf/io/detail/json.hpp
@@ -46,29 +46,14 @@ class reader {
   std::unique_ptr<impl> _impl;
 
  public:
-  /**
-   * @brief Constructor from an array of file paths
-   *
-   * @param filepaths Paths to the files containing the input dataset
-   * @param options Settings for controlling reading behavior
-   * @param stream CUDA stream used for device memory operations and kernel launches
-   * @param mr Device memory resource to use for device memory allocation
-   */
-  explicit reader(std::vector<std::string> const& filepaths,
-                  json_reader_options const& options,
-                  rmm::cuda_stream_view stream,
-                  rmm::mr::device_memory_resource* mr);
-
   /**
    * @brief Constructor from an array of datasources
    *
-   * @param sources Input `datasource` objects to read the dataset from
    * @param options Settings for controlling reading behavior
    * @param stream CUDA stream used for device memory operations and kernel launches
    * @param mr Device memory resource to use for device memory allocation
    */
-  explicit reader(std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
-                  json_reader_options const& options,
+  explicit reader(json_reader_options const& options,
                   rmm::cuda_stream_view stream,
                   rmm::mr::device_memory_resource* mr);
 
@@ -77,13 +62,17 @@ class reader {
    */
   ~reader();
 
-  /*
+  /**
    * @brief Reads and returns the entire data set.
    *
+   * @param[in] sources Input `datasource` objects to read the dataset from
    * @param[in] options Settings for controlling reading behavior
+   * @param[in] stream CUDA stream used for device memory operations and kernel launches
+   *
    * @return cudf::table object that contains the array of cudf::column.
*/ - table_with_metadata read(json_reader_options const& options, + table_with_metadata read(std::vector>& sources, + json_reader_options const& options, rmm::cuda_stream_view stream = rmm::cuda_stream_default); }; diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 438cb1762c6..b4a0ae2761f 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -193,10 +193,9 @@ table_with_metadata read_json(json_reader_options options, rmm::mr::device_memor options.get_byte_range_offset(), options.get_byte_range_size_with_padding()); - auto reader = - std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); + auto reader = std::make_unique(options, rmm::cuda_stream_default, mr); - return reader->read(options); + return reader->read(datasources, options); } table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_resource* mr) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index cfda7bb11dc..93c68752d2c 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -199,22 +199,22 @@ std::pair, col_map_ptr_type> reader::impl::get_json_obj create_col_names_hash_map(sorted_info->get_column(2).view(), stream)}; } -void reader::impl::ingest_raw_input(std::vector& buffer, +void reader::impl::ingest_raw_input(std::vector> const& sources, + std::vector& buffer, size_t range_offset, size_t range_size, size_t range_size_padded) { // Iterate through the user defined sources and read the contents into the local buffer - CUDF_EXPECTS(!sources_.empty(), "No sources were defined"); size_t total_source_size = 0; - for (const auto& source : sources_) { + for (const auto& source : sources) { total_source_size += source->size(); } - total_source_size = total_source_size - range_offset; + total_source_size = total_source_size - (range_offset * sources.size()); buffer.resize(total_source_size); size_t bytes_read = 0; - for (const auto& source : sources_) { + for (const auto& source : sources) { if (!source->is_empty()) { auto data_size = (range_size_padded != 0) ? 
range_size_padded : source->size(); bytes_read += source->host_read(range_offset, data_size, &buffer[bytes_read]); @@ -611,11 +611,10 @@ table_with_metadata reader::impl::convert_data_to_table(device_span(std::move(out_columns)), metadata_}; } -reader::impl::impl(std::vector>&& sources, - json_reader_options const& options, +reader::impl::impl(json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : options_(options), mr_(mr), sources_(std::move(sources)) + : options_(options), mr_(mr) { CUDF_EXPECTS(options_.is_enabled_lines(), "Only JSON Lines format is currently supported.\n"); @@ -635,7 +634,8 @@ reader::impl::impl(std::vector>&& sources, * * @return Table and its metadata */ -table_with_metadata reader::impl::read(json_reader_options const& options, +table_with_metadata reader::impl::read(std::vector>& sources, + json_reader_options const& options, rmm::cuda_stream_view stream) { auto range_offset = options.get_byte_range_offset(); @@ -644,7 +644,7 @@ table_with_metadata reader::impl::read(json_reader_options const& options, std::vector buffer; - ingest_raw_input(buffer, range_offset, range_size, range_size_padded); + ingest_raw_input(sources, buffer, range_offset, range_size, range_size_padded); CUDF_EXPECTS(buffer.size() != 0, "Ingest failed: input data is null.\n"); @@ -669,21 +669,23 @@ table_with_metadata reader::impl::read(json_reader_options const& options, } // Forward to implementation -reader::reader(std::vector>&& sources, - json_reader_options const& options, +reader::reader(json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - _impl = std::make_unique(std::move(sources), options, stream, mr); + _impl = std::make_unique(options, stream, mr); } // Destructor within this translation unit reader::~reader() = default; // Forward to implementation -table_with_metadata reader::read(json_reader_options const& options, rmm::cuda_stream_view stream) +table_with_metadata reader::read(std::vector>& sources, + json_reader_options const& options, + rmm::cuda_stream_view stream) { - return table_with_metadata{_impl->read(options, stream)}; + CUDF_EXPECTS(not sources.empty(), "No sources were defined"); + return table_with_metadata{_impl->read(sources, options, stream)}; } } // namespace json } // namespace detail diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 5e07c38a4c7..25ff47a8d6a 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -56,8 +56,6 @@ class reader::impl { rmm::mr::device_memory_resource* mr_ = nullptr; - std::vector> sources_; - const char* uncomp_data_ = nullptr; size_t uncomp_size_ = 0; @@ -111,7 +109,8 @@ class reader::impl { * @param[in] range_size Bytes to read; use `0` for all remaining data * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data */ - void ingest_raw_input(std::vector& buffer, + void ingest_raw_input(std::vector> const& sources, + std::vector& buffer, size_t range_offset, size_t range_size, size_t range_size_padded); @@ -188,20 +187,22 @@ class reader::impl { /** * @brief Constructor from a dataset source with reader options. 
*/ - explicit impl(std::vector>&& sources, - json_reader_options const& options, + explicit impl(json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** * @brief Read an entire set or a subset of data from the source * + * @param[in] sources Input `datasource` objects to read the dataset from * @param[in] options Settings for controlling reading behavior * @param[in] stream CUDA stream used for device memory operations and kernel launches. * * @return Table and its metadata */ - table_with_metadata read(json_reader_options const& options, rmm::cuda_stream_view stream); + table_with_metadata read(std::vector>& sources, + json_reader_options const& options, + rmm::cuda_stream_view stream); }; } // namespace json From 166c4d3c31edc386d737caa792c5d53523010d87 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 19 Aug 2021 21:45:58 -0500 Subject: [PATCH 08/32] delete useless json reader wrapper class --- cpp/include/cudf/io/detail/json.hpp | 49 ++++++-------------- cpp/src/io/functions.cpp | 6 +-- cpp/src/io/json/reader_impl.cu | 69 +++++++++++++---------------- cpp/src/io/json/reader_impl.hpp | 8 ++-- 4 files changed, 48 insertions(+), 84 deletions(-) diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index f39f42626bc..3a443b9b3d0 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -38,43 +38,20 @@ namespace detail { namespace json { /** - * @brief Class to read JSON dataset data into columns. + * @brief Reads and returns the entire data set. + * + * @param[in] sources Input `datasource` objects to read the dataset from + * @param[in] options Settings for controlling reading behavior + * @param[in] stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + * + * @return cudf::table object that contains the array of cudf::column. */ -class reader { - private: - class impl; - std::unique_ptr _impl; - - public: - /** - * @brief Constructor from an array of datasources - * - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - explicit reader(json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - - /** - * @brief Destructor explicitly-declared to avoid inlined in header - */ - ~reader(); - - /** - * @brief Reads and returns the entire data set. - * - * @param[in] sources Input `datasource` objects to read the dataset from - * @param[in] options Settings for controlling reading behavior - * @param[in] stream CUDA stream used for device memory operations and kernel launches - * - * @return cudf::table object that contains the array of cudf::column. 
- */ - table_with_metadata read(std::vector>& sources, - json_reader_options const& options, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); -}; +table_with_metadata read_json( + std::vector>& sources, + json_reader_options const& options, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace json } // namespace detail diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index b4a0ae2761f..db156144a61 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -183,8 +183,6 @@ compression_type infer_compression_type(compression_type compression, source_inf table_with_metadata read_json(json_reader_options options, rmm::mr::device_memory_resource* mr) { - namespace json = cudf::io::detail::json; - CUDF_FUNC_RANGE(); options.set_compression(infer_compression_type(options.get_compression(), options.get_source())); @@ -193,9 +191,7 @@ table_with_metadata read_json(json_reader_options options, rmm::mr::device_memor options.get_byte_range_offset(), options.get_byte_range_size_with_padding()); - auto reader = std::make_unique(options, rmm::cuda_stream_default, mr); - - return reader->read(datasources, options); + return detail::json::read_json(datasources, options, rmm::cuda_stream_default, mr); } table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_resource* mr) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 93c68752d2c..5eab32d68de 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -183,7 +183,7 @@ auto sort_keys_info_by_offset(std::unique_ptr
info) * * @return Names of JSON object keys in the file */ -std::pair, col_map_ptr_type> reader::impl::get_json_object_keys_hashes( +std::pair, col_map_ptr_type> reader_impl::get_json_object_keys_hashes( device_span rec_starts, rmm::cuda_stream_view stream) { auto info = create_json_keys_info_table( @@ -199,11 +199,11 @@ std::pair, col_map_ptr_type> reader::impl::get_json_obj create_col_names_hash_map(sorted_info->get_column(2).view(), stream)}; } -void reader::impl::ingest_raw_input(std::vector> const& sources, - std::vector& buffer, - size_t range_offset, - size_t range_size, - size_t range_size_padded) +void reader_impl::ingest_raw_input(std::vector> const& sources, + std::vector& buffer, + size_t range_offset, + size_t range_size, + size_t range_size_padded) { // Iterate through the user defined sources and read the contents into the local buffer size_t total_source_size = 0; @@ -232,8 +232,7 @@ void reader::impl::ingest_raw_input(std::vector> con * Sets the uncomp_data_ and uncomp_size_ data members * Loads the data into device memory if byte range parameters are not used */ -void reader::impl::decompress_input(std::vector const& buffer, - rmm::cuda_stream_view stream) +void reader_impl::decompress_input(std::vector const& buffer, rmm::cuda_stream_view stream) { if (options_.get_compression() == compression_type::NONE) { // Do not use the owner vector here to avoid extra copy @@ -252,7 +251,7 @@ void reader::impl::decompress_input(std::vector const& buffer, if (load_whole_source_) data_ = rmm::device_buffer(uncomp_data_, uncomp_size_, stream); } -rmm::device_uvector reader::impl::find_record_starts(rmm::cuda_stream_view stream) +rmm::device_uvector reader_impl::find_record_starts(rmm::cuda_stream_view stream) { std::vector chars_to_count{'\n'}; // Currently, ignoring lineterminations within quotes is handled by recording the records of both, @@ -327,8 +326,8 @@ rmm::device_uvector reader::impl::find_record_starts(rmm::cuda_stream_ * Only rows that need to be parsed are copied, based on the byte range * Also updates the array of record starts to match the device data offset. 
*/ -void reader::impl::upload_data_to_device(rmm::device_uvector& rec_starts, - rmm::cuda_stream_view stream) +void reader_impl::upload_data_to_device(rmm::device_uvector& rec_starts, + rmm::cuda_stream_view stream) { size_t start_offset = 0; size_t end_offset = uncomp_size_; @@ -366,8 +365,8 @@ void reader::impl::upload_data_to_device(rmm::device_uvector& rec_star data_ = rmm::device_buffer(uncomp_data_ + start_offset, bytes_to_upload, stream); } -void reader::impl::set_column_names(device_span rec_starts, - rmm::cuda_stream_view stream) +void reader_impl::set_column_names(device_span rec_starts, + rmm::cuda_stream_view stream) { // If file only contains one row, use the file size for the row size uint64_t first_row_len = data_.size() / sizeof(char); @@ -417,7 +416,7 @@ void reader::impl::set_column_names(device_span rec_starts, } } -std::vector reader::impl::parse_data_types( +std::vector reader_impl::parse_data_types( std::vector const& types_as_strings) { CUDF_EXPECTS(types_as_strings.size() == metadata_.column_names.size(), @@ -459,8 +458,8 @@ std::vector reader::impl::parse_data_types( return dtypes; } -void reader::impl::set_data_types(device_span rec_starts, - rmm::cuda_stream_view stream) +void reader_impl::set_data_types(device_span rec_starts, + rmm::cuda_stream_view stream) { bool has_to_infer_column_types = std::visit([](const auto& dtypes) { return dtypes.empty(); }, options_.get_dtypes()); @@ -528,8 +527,8 @@ void reader::impl::set_data_types(device_span rec_starts, } } -table_with_metadata reader::impl::convert_data_to_table(device_span rec_starts, - rmm::cuda_stream_view stream) +table_with_metadata reader_impl::convert_data_to_table(device_span rec_starts, + rmm::cuda_stream_view stream) { const auto num_columns = dtypes_.size(); const auto num_records = rec_starts.size(); @@ -611,9 +610,9 @@ table_with_metadata reader::impl::convert_data_to_table(device_span(std::move(out_columns)), metadata_}; } -reader::impl::impl(json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +reader_impl::reader_impl(json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) : options_(options), mr_(mr) { CUDF_EXPECTS(options_.is_enabled_lines(), "Only JSON Lines format is currently supported.\n"); @@ -634,9 +633,9 @@ reader::impl::impl(json_reader_options const& options, * * @return Table and its metadata */ -table_with_metadata reader::impl::read(std::vector>& sources, - json_reader_options const& options, - rmm::cuda_stream_view stream) +table_with_metadata reader_impl::read(std::vector>& sources, + json_reader_options const& options, + rmm::cuda_stream_view stream) { auto range_offset = options.get_byte_range_offset(); auto range_size = options.get_byte_range_size(); @@ -668,24 +667,16 @@ table_with_metadata reader::impl::read(std::vector>& return convert_data_to_table(rec_starts, stream); } -// Forward to implementation -reader::reader(json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +table_with_metadata read_json(std::vector>& sources, + json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - _impl = std::make_unique(options, stream, mr); -} + CUDF_EXPECTS(not sources.empty(), "No sources were defined"); -// Destructor within this translation unit -reader::~reader() = default; + auto impl = std::make_unique(options, stream, mr); -// Forward to implementation 
-table_with_metadata reader::read(std::vector>& sources, - json_reader_options const& options, - rmm::cuda_stream_view stream) -{ - CUDF_EXPECTS(not sources.empty(), "No sources were defined"); - return table_with_metadata{_impl->read(sources, options, stream)}; + return table_with_metadata{impl->read(sources, options, stream)}; } } // namespace json } // namespace detail diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 25ff47a8d6a..bdeaa81ba78 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -49,7 +49,7 @@ using col_map_ptr_type = std::unique_ptr Date: Thu, 19 Aug 2021 21:47:19 -0500 Subject: [PATCH 09/32] delete unused arrow namespace declaration --- cpp/include/cudf/io/detail/json.hpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 3a443b9b3d0..2417798d4af 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -25,13 +25,6 @@ #include -// Forward declarations -namespace arrow { -namespace io { -class RandomAccessFile; -} -} // namespace arrow - namespace cudf { namespace io { namespace detail { From 5a997b841cbe7f836b1300d796033f3dfa5551ec Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 19 Aug 2021 21:51:51 -0500 Subject: [PATCH 10/32] replace json reader_impl mr member with argument --- cpp/include/cudf/io/detail/json.hpp | 2 +- cpp/src/io/json/reader_impl.cu | 24 ++++++++++++------------ cpp/src/io/json/reader_impl.hpp | 14 +++++++------- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 2417798d4af..7ab8906e5a9 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -36,7 +36,7 @@ namespace json { * @param[in] sources Input `datasource` objects to read the dataset from * @param[in] options Settings for controlling reading behavior * @param[in] stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation + * @param[in] mr Device memory resource to use for device memory allocation * * @return cudf::table object that contains the array of cudf::column. 
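 *
 * Example (a minimal sketch; the `sources` vector is assumed to be built with
 * cudf::io::datasource::create, as io/functions.cpp does before forwarding here):
 * @code
 *   auto sources = cudf::io::datasource::create(filepaths);
 *   auto result  = cudf::io::detail::json::read_json(sources, options);
 * @endcode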
*/ diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 5eab32d68de..c2b1d5ed824 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -528,7 +528,8 @@ void reader_impl::set_data_types(device_span rec_starts, } table_with_metadata reader_impl::convert_data_to_table(device_span rec_starts, - rmm::cuda_stream_view stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { const auto num_columns = dtypes_.size(); const auto num_records = rec_starts.size(); @@ -536,7 +537,7 @@ table_with_metadata reader_impl::convert_data_to_table(device_span out_buffers; for (size_t col = 0; col < num_columns; ++col) { - out_buffers.emplace_back(dtypes_[col], num_records, true, stream, mr_); + out_buffers.emplace_back(dtypes_[col], num_records, true, stream, mr); } thrust::host_vector h_dtypes(num_columns); @@ -591,11 +592,11 @@ table_with_metadata reader_impl::convert_data_to_table(device_spantype().id() == type_id::STRING) { // Need to remove escape character in case of '\"' and '\\' out_columns.emplace_back(cudf::strings::detail::replace( - out_column->view(), target->view(), repl->view(), stream, mr_)); + out_column->view(), target->view(), repl->view(), stream, mr)); } else { out_columns.emplace_back(std::move(out_column)); } @@ -610,10 +611,8 @@ table_with_metadata reader_impl::convert_data_to_table(device_span(std::move(out_columns)), metadata_}; } -reader_impl::reader_impl(json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : options_(options), mr_(mr) +reader_impl::reader_impl(json_reader_options const& options, rmm::cuda_stream_view stream) + : options_(options) { CUDF_EXPECTS(options_.is_enabled_lines(), "Only JSON Lines format is currently supported.\n"); @@ -635,7 +634,8 @@ reader_impl::reader_impl(json_reader_options const& options, */ table_with_metadata reader_impl::read(std::vector>& sources, json_reader_options const& options, - rmm::cuda_stream_view stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto range_offset = options.get_byte_range_offset(); auto range_size = options.get_byte_range_size(); @@ -664,7 +664,7 @@ table_with_metadata reader_impl::read(std::vector>& set_data_types(rec_starts, stream); CUDF_EXPECTS(!dtypes_.empty(), "Error in data type detection.\n"); - return convert_data_to_table(rec_starts, stream); + return convert_data_to_table(rec_starts, stream, mr); } table_with_metadata read_json(std::vector>& sources, @@ -674,9 +674,9 @@ table_with_metadata read_json(std::vector> { CUDF_EXPECTS(not sources.empty(), "No sources were defined"); - auto impl = std::make_unique(options, stream, mr); + auto impl = std::make_unique(options, stream); - return table_with_metadata{impl->read(sources, options, stream)}; + return table_with_metadata{impl->read(sources, options, stream, mr)}; } } // namespace json } // namespace detail diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index bdeaa81ba78..4498b48741d 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -54,8 +54,6 @@ class reader_impl { private: const json_reader_options options_{}; - rmm::mr::device_memory_resource* mr_ = nullptr; - const char* uncomp_data_ = nullptr; size_t uncomp_size_ = 0; @@ -177,19 +175,19 @@ class reader_impl { * * @param[in] rec_starts Record starts in device memory * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
+ * @param[in] mr Device memory resource to use for device memory allocation * * @return Table and its metadata */ table_with_metadata convert_data_to_table(device_span rec_starts, - rmm::cuda_stream_view stream); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); public: /** * @brief Constructor from a dataset source with reader options. */ - explicit reader_impl(json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + explicit reader_impl(json_reader_options const& options, rmm::cuda_stream_view stream); /** * @brief Read an entire set or a subset of data from the source @@ -197,12 +195,14 @@ class reader_impl { * @param[in] sources Input `datasource` objects to read the dataset from * @param[in] options Settings for controlling reading behavior * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @param[in] mr Device memory resource to use for device memory allocation * * @return Table and its metadata */ table_with_metadata read(std::vector>& sources, json_reader_options const& options, - rmm::cuda_stream_view stream); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); }; } // namespace json From 89163fb5b5748975b2d51a004c93a3489854abfe Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 19 Aug 2021 22:07:46 -0500 Subject: [PATCH 11/32] replace json reader_impl options with local variable --- cpp/src/io/json/reader_impl.cu | 55 ++++++++++++++++----------------- cpp/src/io/json/reader_impl.hpp | 16 +++++----- 2 files changed, 34 insertions(+), 37 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index c2b1d5ed824..9a1ee49a7de 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -111,7 +111,7 @@ col_map_ptr_type create_col_names_hash_map(column_view column_name_hashes, * * @return std::unique_ptr
<table> cudf table with three columns (offsets, lengths, hashes) */ -std::unique_ptr<table>
create_json_keys_info_table(const parse_options_view& options, +std::unique_ptr<table>
create_json_keys_info_table(parse_options_view const& parse_opts, device_span const data, device_span const row_offsets, rmm::cuda_stream_view stream) @@ -119,7 +119,7 @@ std::unique_ptr
create_json_keys_info_table(const parse_options_view& opt // Count keys rmm::device_scalar key_counter(0, stream); cudf::io::json::gpu::collect_keys_info( - options, data, row_offsets, key_counter.data(), {}, stream); + parse_opts, data, row_offsets, key_counter.data(), {}, stream); // Allocate columns to store hash value, length, and offset of each JSON object key in the input auto const num_keys = key_counter.value(stream); @@ -135,7 +135,7 @@ std::unique_ptr
create_json_keys_info_table(const parse_options_view& opt key_counter.set_value_to_zero_async(stream); // Fill the allocated columns cudf::io::json::gpu::collect_keys_info( - options, data, row_offsets, key_counter.data(), {*info_table_mdv}, stream); + parse_opts, data, row_offsets, key_counter.data(), {*info_table_mdv}, stream); return info_table; } @@ -232,9 +232,11 @@ void reader_impl::ingest_raw_input(std::vector> cons * Sets the uncomp_data_ and uncomp_size_ data members * Loads the data into device memory if byte range parameters are not used */ -void reader_impl::decompress_input(std::vector const& buffer, rmm::cuda_stream_view stream) +void reader_impl::decompress_input(json_reader_options const& read_opts, + std::vector const& buffer, + rmm::cuda_stream_view stream) { - if (options_.get_compression() == compression_type::NONE) { + if (read_opts.get_compression() == compression_type::NONE) { // Do not use the owner vector here to avoid extra copy uncomp_data_ = reinterpret_cast(buffer.data()); uncomp_size_ = buffer.size(); @@ -243,7 +245,7 @@ void reader_impl::decompress_input(std::vector const& buffer, rmm::cuda host_span( // reinterpret_cast(buffer.data()), buffer.size()), - options_.get_compression()); + read_opts.get_compression()); uncomp_data_ = uncomp_data_owner_.data(); uncomp_size_ = uncomp_data_owner_.size(); @@ -458,11 +460,12 @@ std::vector reader_impl::parse_data_types( return dtypes; } -void reader_impl::set_data_types(device_span rec_starts, +void reader_impl::set_data_types(json_reader_options const& reader_opts, + device_span rec_starts, rmm::cuda_stream_view stream) { bool has_to_infer_column_types = - std::visit([](const auto& dtypes) { return dtypes.empty(); }, options_.get_dtypes()); + std::visit([](const auto& dtypes) { return dtypes.empty(); }, reader_opts.get_dtypes()); if (!has_to_infer_column_types) { dtypes_ = std::visit( cudf::detail::visitor_overload{ @@ -480,7 +483,7 @@ void reader_impl::set_data_types(device_span rec_starts, return sorted_dtypes; }, [&](std::vector const& dtypes) { return parse_data_types(dtypes); }}, - options_.get_dtypes()); + reader_opts.get_dtypes()); } else { CUDF_EXPECTS(rec_starts.size() != 0, "No data available for data type inference.\n"); auto const num_columns = metadata_.column_names.size(); @@ -611,18 +614,6 @@ table_with_metadata reader_impl::convert_data_to_table(device_span(std::move(out_columns)), metadata_}; } -reader_impl::reader_impl(json_reader_options const& options, rmm::cuda_stream_view stream) - : options_(options) -{ - CUDF_EXPECTS(options_.is_enabled_lines(), "Only JSON Lines format is currently supported.\n"); - - opts_.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); - opts_.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); - opts_.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); - - opts_.dayfirst = options.is_enabled_dayfirst(); -} - /** * @brief Read an entire set or a subset of data from the source * @@ -633,13 +624,21 @@ reader_impl::reader_impl(json_reader_options const& options, rmm::cuda_stream_vi * @return Table and its metadata */ table_with_metadata reader_impl::read(std::vector>& sources, - json_reader_options const& options, + json_reader_options const& reader_opts, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto range_offset = options.get_byte_range_offset(); - auto range_size = options.get_byte_range_size(); - auto range_size_padded = options.get_byte_range_size_with_padding(); + 
CUDF_EXPECTS(reader_opts.is_enabled_lines(), "Only JSON Lines format is currently supported.\n"); + + opts_.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + opts_.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + opts_.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); + + opts_.dayfirst = reader_opts.is_enabled_dayfirst(); + + auto range_offset = reader_opts.get_byte_range_offset(); + auto range_size = reader_opts.get_byte_range_size(); + auto range_size_padded = reader_opts.get_byte_range_size_with_padding(); std::vector buffer; @@ -647,7 +646,7 @@ table_with_metadata reader_impl::read(std::vector>& CUDF_EXPECTS(buffer.size() != 0, "Ingest failed: input data is null.\n"); - decompress_input(buffer, stream); + decompress_input(reader_opts, buffer, stream); CUDF_EXPECTS(uncomp_data_ != nullptr, "Ingest failed: uncompressed input data is null.\n"); CUDF_EXPECTS(uncomp_size_ != 0, "Ingest failed: uncompressed input data has zero size.\n"); @@ -661,7 +660,7 @@ table_with_metadata reader_impl::read(std::vector>& set_column_names(rec_starts, stream); CUDF_EXPECTS(!metadata_.column_names.empty(), "Error determining column names.\n"); - set_data_types(rec_starts, stream); + set_data_types(reader_opts, rec_starts, stream); CUDF_EXPECTS(!dtypes_.empty(), "Error in data type detection.\n"); return convert_data_to_table(rec_starts, stream, mr); @@ -674,7 +673,7 @@ table_with_metadata read_json(std::vector> { CUDF_EXPECTS(not sources.empty(), "No sources were defined"); - auto impl = std::make_unique(options, stream); + auto impl = std::make_unique(); return table_with_metadata{impl->read(sources, options, stream, mr)}; } diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 4498b48741d..6cfc85d880e 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -52,8 +52,6 @@ using col_map_ptr_type = std::unique_ptr const& buffer, rmm::cuda_stream_view stream); + void decompress_input(json_reader_options const& options, + std::vector const& buffer, + rmm::cuda_stream_view stream); /** * @brief Finds all record starts in the file. @@ -165,10 +165,13 @@ class reader_impl { * * If user does not pass the data types, deduces types from the file content * + * @param[in] reader_opts Settings for controlling reading behavior * @param[in] rec_starts Record starts in device memory * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ - void set_data_types(device_span rec_starts, rmm::cuda_stream_view stream); + void set_data_types(json_reader_options const& reader_opts, + device_span rec_starts, + rmm::cuda_stream_view stream); /** * @brief Parse the input data and store results a table @@ -184,11 +187,6 @@ class reader_impl { rmm::mr::device_memory_resource* mr); public: - /** - * @brief Constructor from a dataset source with reader options. 
- */ - explicit reader_impl(json_reader_options const& options, rmm::cuda_stream_view stream); - /** * @brief Read an entire set or a subset of data from the source * From 7ff3f1a9264e1b15c2e81eb7a4ff3b824567f0c3 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 19 Aug 2021 22:11:51 -0500 Subject: [PATCH 12/32] remove unused json_reader::allow_newlines_in_strings_ member --- cpp/src/io/json/reader_impl.cu | 23 ----------------------- cpp/src/io/json/reader_impl.hpp | 1 - 2 files changed, 24 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 9a1ee49a7de..d1ca067aaf5 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -259,7 +259,6 @@ rmm::device_uvector reader_impl::find_record_starts(rmm::cuda_stream_v // Currently, ignoring lineterminations within quotes is handled by recording the records of both, // and then filtering out the records that is a quotechar or a linetermination within a quotechar // pair. - if (allow_newlines_in_strings_) { chars_to_count.push_back('\"'); } // If not starting at an offset, add an extra row to account for the first row in the file cudf::size_type prefilter_count = ((byte_range_offset_ == 0) ? 1 : 0); if (load_whole_source_) { @@ -278,7 +277,6 @@ rmm::device_uvector reader_impl::find_record_starts(rmm::cuda_stream_v } std::vector chars_to_find{'\n'}; - if (allow_newlines_in_strings_) { chars_to_find.push_back('\"'); } // Passing offset = 1 to return positions AFTER the found character if (load_whole_source_) { find_all_from_set(data_, chars_to_find, 1, find_result_ptr, stream); @@ -292,27 +290,6 @@ rmm::device_uvector reader_impl::find_record_starts(rmm::cuda_stream_v thrust::sort(rmm::exec_policy(stream), rec_starts.begin(), rec_starts.end()); auto filtered_count = prefilter_count; - if (allow_newlines_in_strings_) { - auto h_rec_starts = cudf::detail::make_std_vector_sync(rec_starts, stream); - bool quotation = false; - for (cudf::size_type i = 1; i < prefilter_count; ++i) { - if (uncomp_data_[h_rec_starts[i] - 1] == '\"') { - quotation = !quotation; - h_rec_starts[i] = uncomp_size_; - filtered_count--; - } else if (quotation) { - h_rec_starts[i] = uncomp_size_; - filtered_count--; - } - } - CUDA_TRY(cudaMemcpyAsync(rec_starts.data(), - h_rec_starts.data(), - h_rec_starts.size() * sizeof(uint64_t), - cudaMemcpyDefault, - stream.value())); - thrust::sort(rmm::exec_policy(stream), rec_starts.begin(), rec_starts.end()); - stream.synchronize(); - } // Exclude the ending newline as it does not precede a record start if (uncomp_data_[uncomp_size_ - 1] == '\n') { filtered_count--; } diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 6cfc85d880e..3923890f583 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -72,7 +72,6 @@ class reader_impl { std::unique_ptr> d_key_col_map_; // parsing options - const bool allow_newlines_in_strings_ = false; parse_options opts_{',', '\n', '\"', '.'}; /** From 5c95398ba19a82910a2d6fd946ce54d86443708c Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 19 Aug 2021 22:22:42 -0500 Subject: [PATCH 13/32] replace json reader_impl::opts_ with local variable --- cpp/src/io/json/reader_impl.cu | 40 ++++++++++++++++++++------------- cpp/src/io/json/reader_impl.hpp | 15 ++++++++----- 2 files changed, 33 insertions(+), 22 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index d1ca067aaf5..25fd17894a4 100644 --- 
a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -184,10 +184,12 @@ auto sort_keys_info_by_offset(std::unique_ptr<table>
info) * @return Names of JSON object keys in the file */ std::pair, col_map_ptr_type> reader_impl::get_json_object_keys_hashes( - device_span rec_starts, rmm::cuda_stream_view stream) + parse_options_view const& parse_opts, + device_span rec_starts, + rmm::cuda_stream_view stream) { auto info = create_json_keys_info_table( - opts_.view(), + parse_opts, device_span(static_cast(data_.data()), data_.size()), rec_starts, stream); @@ -344,7 +346,8 @@ void reader_impl::upload_data_to_device(rmm::device_uvector& rec_start data_ = rmm::device_buffer(uncomp_data_ + start_offset, bytes_to_upload, stream); } -void reader_impl::set_column_names(device_span rec_starts, +void reader_impl::set_column_names(parse_options_view const& parse_opts, + device_span rec_starts, rmm::cuda_stream_view stream) { // If file only contains one row, use the file size for the row size @@ -376,7 +379,7 @@ void reader_impl::set_column_names(device_span rec_starts, // If the first opening bracket is '{', assume object format if (first_curly_bracket < first_square_bracket) { // use keys as column names if input rows are objects - auto keys_desc = get_json_object_keys_hashes(rec_starts, stream); + auto keys_desc = get_json_object_keys_hashes(parse_opts, rec_starts, stream); metadata_.column_names = keys_desc.first; set_column_map(std::move(keys_desc.second), stream); } else { @@ -384,11 +387,12 @@ void reader_impl::set_column_names(device_span rec_starts, bool quotation = false; for (size_t pos = 0; pos < first_row.size(); ++pos) { // Flip the quotation flag if current character is a quotechar - if (first_row[pos] == opts_.quotechar) { + if (first_row[pos] == parse_opts.quotechar) { quotation = !quotation; } // Check if end of a column/row - else if (pos == first_row.size() - 1 || (!quotation && first_row[pos] == opts_.delimiter)) { + else if (pos == first_row.size() - 1 || + (!quotation && first_row[pos] == parse_opts.delimiter)) { metadata_.column_names.emplace_back(std::to_string(cols_found++)); } } @@ -438,6 +442,7 @@ std::vector reader_impl::parse_data_types( } void reader_impl::set_data_types(json_reader_options const& reader_opts, + parse_options_view const& parse_opts, device_span rec_starts, rmm::cuda_stream_view stream) { @@ -467,7 +472,7 @@ void reader_impl::set_data_types(json_reader_options const& reader_opts, auto const do_set_null_count = key_to_col_idx_map_ != nullptr; auto const h_column_infos = cudf::io::json::gpu::detect_data_types( - opts_.view(), + parse_opts, device_span(static_cast(data_.data()), data_.size()), rec_starts, do_set_null_count, @@ -507,7 +512,8 @@ void reader_impl::set_data_types(json_reader_options const& reader_opts, } } -table_with_metadata reader_impl::convert_data_to_table(device_span rec_starts, +table_with_metadata reader_impl::convert_data_to_table(parse_options_view const& parse_opts, + device_span rec_starts, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -537,7 +543,7 @@ table_with_metadata reader_impl::convert_data_to_table(device_span(num_columns, stream); cudf::io::json::gpu::convert_json_to_columns( - opts_.view(), + parse_opts, device_span(static_cast(data_.data()), data_.size()), rec_starts, d_dtypes, @@ -607,11 +613,13 @@ table_with_metadata reader_impl::read(std::vector>& { CUDF_EXPECTS(reader_opts.is_enabled_lines(), "Only JSON Lines format is currently supported.\n"); - opts_.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); - opts_.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); - opts_.trie_na = 
cudf::detail::create_serialized_trie({"", "null"}, stream); + auto parse_opts = parse_options{',', '\n', '\"', '.'}; - opts_.dayfirst = reader_opts.is_enabled_dayfirst(); + parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + parse_opts.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); + + parse_opts.dayfirst = reader_opts.is_enabled_dayfirst(); auto range_offset = reader_opts.get_byte_range_offset(); auto range_size = reader_opts.get_byte_range_size(); @@ -634,13 +642,13 @@ table_with_metadata reader_impl::read(std::vector>& upload_data_to_device(rec_starts, stream); CUDF_EXPECTS(data_.size() != 0, "Error uploading input data to the GPU.\n"); - set_column_names(rec_starts, stream); + set_column_names(parse_opts.view(), rec_starts, stream); CUDF_EXPECTS(!metadata_.column_names.empty(), "Error determining column names.\n"); - set_data_types(reader_opts, rec_starts, stream); + set_data_types(reader_opts, parse_opts.view(), rec_starts, stream); CUDF_EXPECTS(!dtypes_.empty(), "Error in data type detection.\n"); - return convert_data_to_table(rec_starts, stream, mr); + return convert_data_to_table(parse_opts.view(), rec_starts, stream, mr); } table_with_metadata read_json(std::vector>& sources, diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 3923890f583..3444f33bc62 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -71,9 +71,6 @@ class reader_impl { col_map_ptr_type key_to_col_idx_map_; std::unique_ptr> d_key_col_map_; - // parsing options - parse_options opts_{',', '\n', '\"', '.'}; - /** * @brief Sets the column map data member and makes a device copy to be used as a kernel * parameter. @@ -116,7 +113,9 @@ class reader_impl { * @return Array of keys and a map that maps their hash values to column indices */ std::pair, col_map_ptr_type> get_json_object_keys_hashes( - device_span rec_starts, rmm::cuda_stream_view stream); + parse_options_view const& parse_opts, + device_span rec_starts, + rmm::cuda_stream_view stream); /** * @brief Decompress the input data, if needed @@ -155,7 +154,9 @@ class reader_impl { * @param[in] rec_starts Record starts in device memory * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ - void set_column_names(device_span rec_starts, rmm::cuda_stream_view stream); + void set_column_names(parse_options_view const& parse_opts, + device_span rec_starts, + rmm::cuda_stream_view stream); std::vector parse_data_types(std::vector const& types_as_strings); @@ -169,6 +170,7 @@ class reader_impl { * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
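 * @param[in] parse_opts Parsing options (delimiter, quote character, and tries) used to interpret the raw input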
*/ void set_data_types(json_reader_options const& reader_opts, + parse_options_view const& parse_opts, device_span rec_starts, rmm::cuda_stream_view stream); @@ -181,7 +183,8 @@ class reader_impl { * * @return Table and its metadata */ - table_with_metadata convert_data_to_table(device_span rec_starts, + table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, + device_span rec_starts, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); From 0bde0b1ce6de927da911b05052ac6d8d4c30cc7b Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 19 Aug 2021 22:34:16 -0500 Subject: [PATCH 14/32] remove json reader_impl byte_range members in place of local variables --- cpp/src/io/json/reader_impl.cu | 42 +++++++++++++++++++-------------- cpp/src/io/json/reader_impl.hpp | 10 ++++---- 2 files changed, 28 insertions(+), 24 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 25fd17894a4..c5f08797aea 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -222,10 +222,12 @@ void reader_impl::ingest_raw_input(std::vector> cons bytes_read += source->host_read(range_offset, data_size, &buffer[bytes_read]); } } +} - byte_range_offset_ = range_offset; - byte_range_size_ = range_size; - load_whole_source_ = byte_range_offset_ == 0 && byte_range_size_ == 0; +bool should_load_whole_source(json_reader_options const& reader_opts) +{ + return reader_opts.get_byte_range_offset() == 0 and // + reader_opts.get_byte_range_size() == 0; } /** @@ -234,11 +236,11 @@ void reader_impl::ingest_raw_input(std::vector> cons * Sets the uncomp_data_ and uncomp_size_ data members * Loads the data into device memory if byte range parameters are not used */ -void reader_impl::decompress_input(json_reader_options const& read_opts, +void reader_impl::decompress_input(json_reader_options const& reader_opts, std::vector const& buffer, rmm::cuda_stream_view stream) { - if (read_opts.get_compression() == compression_type::NONE) { + if (reader_opts.get_compression() == compression_type::NONE) { // Do not use the owner vector here to avoid extra copy uncomp_data_ = reinterpret_cast(buffer.data()); uncomp_size_ = buffer.size(); @@ -247,23 +249,26 @@ void reader_impl::decompress_input(json_reader_options const& read_opts, host_span( // reinterpret_cast(buffer.data()), buffer.size()), - read_opts.get_compression()); + reader_opts.get_compression()); uncomp_data_ = uncomp_data_owner_.data(); uncomp_size_ = uncomp_data_owner_.size(); } - if (load_whole_source_) data_ = rmm::device_buffer(uncomp_data_, uncomp_size_, stream); + if (should_load_whole_source(reader_opts)) { + data_ = rmm::device_buffer(uncomp_data_, uncomp_size_, stream); + } } -rmm::device_uvector reader_impl::find_record_starts(rmm::cuda_stream_view stream) +rmm::device_uvector reader_impl::find_record_starts( + json_reader_options const& reader_opts, rmm::cuda_stream_view stream) { std::vector chars_to_count{'\n'}; // Currently, ignoring lineterminations within quotes is handled by recording the records of both, // and then filtering out the records that is a quotechar or a linetermination within a quotechar // pair. // If not starting at an offset, add an extra row to account for the first row in the file - cudf::size_type prefilter_count = ((byte_range_offset_ == 0) ? 1 : 0); - if (load_whole_source_) { + cudf::size_type prefilter_count = ((reader_opts.get_byte_range_offset() == 0) ? 
1 : 0); + if (should_load_whole_source(reader_opts)) { prefilter_count += count_all_from_set(data_, chars_to_count, stream); } else { prefilter_count += count_all_from_set(uncomp_data_, uncomp_size_, chars_to_count, stream); @@ -273,14 +278,14 @@ rmm::device_uvector reader_impl::find_record_starts(rmm::cuda_stream_v auto* find_result_ptr = rec_starts.data(); // Manually adding an extra row to account for the first row in the file - if (byte_range_offset_ == 0) { + if (reader_opts.get_byte_range_offset() == 0) { find_result_ptr++; CUDA_TRY(cudaMemsetAsync(rec_starts.data(), 0ull, sizeof(uint64_t), stream.value())); } std::vector chars_to_find{'\n'}; // Passing offset = 1 to return positions AFTER the found character - if (load_whole_source_) { + if (should_load_whole_source(reader_opts)) { find_all_from_set(data_, chars_to_find, 1, find_result_ptr, stream); } else { find_all_from_set(uncomp_data_, uncomp_size_, chars_to_find, 1, find_result_ptr, stream); @@ -307,19 +312,20 @@ rmm::device_uvector reader_impl::find_record_starts(rmm::cuda_stream_v * Only rows that need to be parsed are copied, based on the byte range * Also updates the array of record starts to match the device data offset. */ -void reader_impl::upload_data_to_device(rmm::device_uvector& rec_starts, +void reader_impl::upload_data_to_device(json_reader_options const& reader_opts, + rmm::device_uvector& rec_starts, rmm::cuda_stream_view stream) { size_t start_offset = 0; size_t end_offset = uncomp_size_; // Trim lines that are outside range - if (byte_range_size_ != 0 || byte_range_offset_ != 0) { + if (reader_opts.get_byte_range_size() != 0 || reader_opts.get_byte_range_offset() != 0) { auto h_rec_starts = cudf::detail::make_std_vector_sync(rec_starts, stream); - if (byte_range_size_ != 0) { + if (reader_opts.get_byte_range_size() != 0) { auto it = h_rec_starts.end() - 1; - while (it >= h_rec_starts.begin() && *it > byte_range_size_) { + while (it >= h_rec_starts.begin() && *it > reader_opts.get_byte_range_size()) { end_offset = *it; --it; } @@ -636,10 +642,10 @@ table_with_metadata reader_impl::read(std::vector>& CUDF_EXPECTS(uncomp_data_ != nullptr, "Ingest failed: uncompressed input data is null.\n"); CUDF_EXPECTS(uncomp_size_ != 0, "Ingest failed: uncompressed input data has zero size.\n"); - auto rec_starts = find_record_starts(stream); + auto rec_starts = find_record_starts(reader_opts, stream); CUDF_EXPECTS(!rec_starts.is_empty(), "Error enumerating records.\n"); - upload_data_to_device(rec_starts, stream); + upload_data_to_device(reader_opts, rec_starts, stream); CUDF_EXPECTS(data_.size() != 0, "Error uploading input data to the GPU.\n"); set_column_names(parse_opts.view(), rec_starts, stream); diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 3444f33bc62..1a4eb282b38 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -59,10 +59,6 @@ class reader_impl { std::vector uncomp_data_owner_; rmm::device_buffer data_; - size_t byte_range_offset_ = 0; - size_t byte_range_size_ = 0; - bool load_whole_source_ = true; - table_metadata metadata_; std::vector dtypes_; @@ -134,7 +130,8 @@ class reader_impl { * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
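 * @param[in] reader_opts Settings for controlling reading behavior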
* @return Record starts in the device memory */ - rmm::device_uvector find_record_starts(rmm::cuda_stream_view stream); + rmm::device_uvector find_record_starts(json_reader_options const& reader_opts, + rmm::cuda_stream_view stream); /** * @brief Uploads the relevant segment of the input json data onto the GPU. @@ -143,7 +140,8 @@ class reader_impl { * Only rows that need to be parsed are copied, based on the byte range * Also updates the array of record starts to match the device data offset. */ - void upload_data_to_device(rmm::device_uvector& rec_starts, + void upload_data_to_device(json_reader_options const& reader_opts, + rmm::device_uvector& rec_starts, rmm::cuda_stream_view stream); /** From b4843f5e9edec7ed88d7dda1420ebccc5ca4c246 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 19 Aug 2021 22:58:24 -0500 Subject: [PATCH 15/32] replace json reader_impl::dtypes_ with local variable --- cpp/src/io/json/reader_impl.cu | 31 +++++++++++++++++++------------ cpp/src/io/json/reader_impl.hpp | 10 +++++----- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index c5f08797aea..0eaebd04008 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -447,15 +447,16 @@ std::vector reader_impl::parse_data_types( return dtypes; } -void reader_impl::set_data_types(json_reader_options const& reader_opts, - parse_options_view const& parse_opts, - device_span rec_starts, - rmm::cuda_stream_view stream) +std::vector reader_impl::get_data_types(json_reader_options const& reader_opts, + parse_options_view const& parse_opts, + device_span rec_starts, + rmm::cuda_stream_view stream) { bool has_to_infer_column_types = std::visit([](const auto& dtypes) { return dtypes.empty(); }, reader_opts.get_dtypes()); + if (!has_to_infer_column_types) { - dtypes_ = std::visit( + return std::visit( cudf::detail::visitor_overload{ [&](const std::vector& dtypes) { return dtypes; }, [&](const std::map& dtypes) { @@ -511,25 +512,30 @@ void reader_impl::set_data_types(json_reader_options const& reader_opts, } }; + std::vector dtypes; + std::transform(std::cbegin(h_column_infos), std::cend(h_column_infos), - std::back_inserter(dtypes_), + std::back_inserter(dtypes), [&](auto const& cinfo) { return data_type{get_type_id(cinfo)}; }); + + return dtypes; } } table_with_metadata reader_impl::convert_data_to_table(parse_options_view const& parse_opts, + std::vector const& dtypes, device_span rec_starts, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - const auto num_columns = dtypes_.size(); + const auto num_columns = dtypes.size(); const auto num_records = rec_starts.size(); // alloc output buffers. 
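 // Each column_buffer owns the device data and the null mask for one output column.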
std::vector out_buffers; for (size_t col = 0; col < num_columns; ++col) { - out_buffers.emplace_back(dtypes_[col], num_records, true, stream, mr); + out_buffers.emplace_back(dtypes[col], num_records, true, stream, mr); } thrust::host_vector h_dtypes(num_columns); @@ -537,7 +543,7 @@ table_with_metadata reader_impl::convert_data_to_table(parse_options_view const& thrust::host_vector h_valid(num_columns); for (size_t i = 0; i < num_columns; ++i) { - h_dtypes[i] = dtypes_[i]; + h_dtypes[i] = dtypes[i]; h_data[i] = out_buffers[i].data(); h_valid[i] = out_buffers[i].null_mask(); } @@ -651,10 +657,11 @@ table_with_metadata reader_impl::read(std::vector>& set_column_names(parse_opts.view(), rec_starts, stream); CUDF_EXPECTS(!metadata_.column_names.empty(), "Error determining column names.\n"); - set_data_types(reader_opts, parse_opts.view(), rec_starts, stream); - CUDF_EXPECTS(!dtypes_.empty(), "Error in data type detection.\n"); + auto dtypes = get_data_types(reader_opts, parse_opts.view(), rec_starts, stream); + + CUDF_EXPECTS(not dtypes.empty(), "Error in data type detection.\n"); - return convert_data_to_table(parse_opts.view(), rec_starts, stream, mr); + return convert_data_to_table(parse_opts.view(), dtypes, rec_starts, stream, mr); } table_with_metadata read_json(std::vector>& sources, diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 1a4eb282b38..8d4900325b8 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -60,7 +60,6 @@ class reader_impl { rmm::device_buffer data_; table_metadata metadata_; - std::vector dtypes_; // the map is only used for files with rows in object format; initialize to a dummy value so the // map object can be passed to the kernel in any case @@ -167,10 +166,10 @@ class reader_impl { * @param[in] rec_starts Record starts in device memory * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
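 * @return Data types of the columns, in the same order as the column names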
*/ - void set_data_types(json_reader_options const& reader_opts, - parse_options_view const& parse_opts, - device_span rec_starts, - rmm::cuda_stream_view stream); + std::vector get_data_types(json_reader_options const& reader_opts, + parse_options_view const& parse_opts, + device_span rec_starts, + rmm::cuda_stream_view stream); /** * @brief Parse the input data and store results a table @@ -182,6 +181,7 @@ class reader_impl { * @return Table and its metadata */ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, + std::vector const& dtypes, device_span rec_starts, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); From f511e6800cd0b58b68fcb8bdf3113eca5cea8413 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 20 Aug 2021 16:55:20 -0500 Subject: [PATCH 16/32] replace json reader_impl::metadata_ with local variable --- cpp/src/io/json/reader_impl.cu | 77 ++++++++++++++++++--------------- cpp/src/io/json/reader_impl.hpp | 13 +++--- 2 files changed, 49 insertions(+), 41 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 0eaebd04008..65179866595 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -352,9 +352,9 @@ void reader_impl::upload_data_to_device(json_reader_options const& reader_opts, data_ = rmm::device_buffer(uncomp_data_ + start_offset, bytes_to_upload, stream); } -void reader_impl::set_column_names(parse_options_view const& parse_opts, - device_span rec_starts, - rmm::cuda_stream_view stream) +std::vector reader_impl::get_column_names(parse_options_view const& parse_opts, + device_span rec_starts, + rmm::cuda_stream_view stream) { // If file only contains one row, use the file size for the row size uint64_t first_row_len = data_.size() / sizeof(char); @@ -385,12 +385,13 @@ void reader_impl::set_column_names(parse_options_view const& parse_opts, // If the first opening bracket is '{', assume object format if (first_curly_bracket < first_square_bracket) { // use keys as column names if input rows are objects - auto keys_desc = get_json_object_keys_hashes(parse_opts, rec_starts, stream); - metadata_.column_names = keys_desc.first; + auto keys_desc = get_json_object_keys_hashes(parse_opts, rec_starts, stream); set_column_map(std::move(keys_desc.second), stream); + return keys_desc.first; } else { - int cols_found = 0; - bool quotation = false; + int cols_found = 0; + bool quotation = false; + auto column_names = std::vector(); for (size_t pos = 0; pos < first_row.size(); ++pos) { // Flip the quotation flag if current character is a quotechar if (first_row[pos] == parse_opts.quotechar) { @@ -399,16 +400,17 @@ void reader_impl::set_column_names(parse_options_view const& parse_opts, // Check if end of a column/row else if (pos == first_row.size() - 1 || (!quotation && first_row[pos] == parse_opts.delimiter)) { - metadata_.column_names.emplace_back(std::to_string(cols_found++)); + column_names.emplace_back(std::to_string(cols_found++)); } } + return column_names; } } std::vector reader_impl::parse_data_types( - std::vector const& types_as_strings) + std::vector const& column_names, std::vector const& types_as_strings) { - CUDF_EXPECTS(types_as_strings.size() == metadata_.column_names.size(), + CUDF_EXPECTS(types_as_strings.size() == column_names.size(), "Need to specify the type of each column.\n"); std::vector dtypes; // Assume that the dtype is in dictionary format only if all elements contain a colon @@ -434,8 +436,8 @@ std::vector 
reader_impl::parse_data_types( }); // Using the map here allows O(n log n) complexity - std::transform(std::cbegin(metadata_.column_names), - std::cend(metadata_.column_names), + std::transform(std::cbegin(column_names), + std::cend(column_names), std::back_inserter(dtypes), [&](auto const& column_name) { return col_type_map[column_name]; }); } else { @@ -449,6 +451,7 @@ std::vector reader_impl::parse_data_types( std::vector reader_impl::get_data_types(json_reader_options const& reader_opts, parse_options_view const& parse_opts, + std::vector const& column_names, device_span rec_starts, rmm::cuda_stream_view stream) { @@ -456,26 +459,28 @@ std::vector reader_impl::get_data_types(json_reader_options const& re std::visit([](const auto& dtypes) { return dtypes.empty(); }, reader_opts.get_dtypes()); if (!has_to_infer_column_types) { - return std::visit( - cudf::detail::visitor_overload{ - [&](const std::vector& dtypes) { return dtypes; }, - [&](const std::map& dtypes) { - std::vector sorted_dtypes; - std::transform(std::cbegin(metadata_.column_names), - std::cend(metadata_.column_names), - std::back_inserter(sorted_dtypes), - [&](auto const& column_name) { - auto const it = dtypes.find(column_name); - CUDF_EXPECTS(it != dtypes.end(), "Must specify types for all columns"); - return it->second; - }); - return sorted_dtypes; - }, - [&](std::vector const& dtypes) { return parse_data_types(dtypes); }}, - reader_opts.get_dtypes()); + return std::visit(cudf::detail::visitor_overload{ + [&](const std::vector& dtypes) { return dtypes; }, + [&](const std::map& dtypes) { + std::vector sorted_dtypes; + std::transform(std::cbegin(column_names), + std::cend(column_names), + std::back_inserter(sorted_dtypes), + [&](auto const& column_name) { + auto const it = dtypes.find(column_name); + CUDF_EXPECTS(it != dtypes.end(), + "Must specify types for all columns"); + return it->second; + }); + return sorted_dtypes; + }, + [&](std::vector const& dtypes) { + return parse_data_types(column_names, dtypes); + }}, + reader_opts.get_dtypes()); } else { CUDF_EXPECTS(rec_starts.size() != 0, "No data available for data type inference.\n"); - auto const num_columns = metadata_.column_names.size(); + auto const num_columns = column_names.size(); auto const do_set_null_count = key_to_col_idx_map_ != nullptr; auto const h_column_infos = cudf::io::json::gpu::detect_data_types( @@ -525,6 +530,7 @@ std::vector reader_impl::get_data_types(json_reader_options const& re table_with_metadata reader_impl::convert_data_to_table(parse_options_view const& parse_opts, std::vector const& dtypes, + std::vector const& column_names, device_span rec_starts, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -606,7 +612,7 @@ table_with_metadata reader_impl::convert_data_to_table(parse_options_view const& CUDF_EXPECTS(!out_columns.empty(), "No columns created from json input"); - return table_with_metadata{std::make_unique
<table>(std::move(out_columns)), metadata_}; } + return table_with_metadata{std::make_unique<table>
(std::move(out_columns)), {column_names}}; } /** @@ -654,14 +660,15 @@ table_with_metadata reader_impl::read(std::vector>& upload_data_to_device(reader_opts, rec_starts, stream); CUDF_EXPECTS(data_.size() != 0, "Error uploading input data to the GPU.\n"); - set_column_names(parse_opts.view(), rec_starts, stream); - CUDF_EXPECTS(!metadata_.column_names.empty(), "Error determining column names.\n"); + auto column_names = get_column_names(parse_opts.view(), rec_starts, stream); + + CUDF_EXPECTS(not column_names.empty(), "Error determining column names.\n"); - auto dtypes = get_data_types(reader_opts, parse_opts.view(), rec_starts, stream); + auto dtypes = get_data_types(reader_opts, parse_opts.view(), column_names, rec_starts, stream); CUDF_EXPECTS(not dtypes.empty(), "Error in data type detection.\n"); - return convert_data_to_table(parse_opts.view(), dtypes, rec_starts, stream, mr); + return convert_data_to_table(parse_opts.view(), dtypes, column_names, rec_starts, stream, mr); } table_with_metadata read_json(std::vector>& sources, diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 8d4900325b8..75031ebb68a 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -59,8 +59,6 @@ class reader_impl { std::vector uncomp_data_owner_; rmm::device_buffer data_; - table_metadata metadata_; - // the map is only used for files with rows in object format; initialize to a dummy value so the // map object can be passed to the kernel in any case col_map_ptr_type key_to_col_idx_map_; @@ -151,11 +149,12 @@ class reader_impl { * @param[in] rec_starts Record starts in device memory * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ - void set_column_names(parse_options_view const& parse_opts, - device_span rec_starts, - rmm::cuda_stream_view stream); + std::vector get_column_names(parse_options_view const& parse_opts, + device_span rec_starts, + rmm::cuda_stream_view stream); - std::vector parse_data_types(std::vector const& types_as_strings); + std::vector parse_data_types(std::vector const& column_names, + std::vector const& types_as_strings); /** * @brief Set the data type array data member @@ -168,6 +167,7 @@ class reader_impl { */ std::vector get_data_types(json_reader_options const& reader_opts, parse_options_view const& parse_opts, + std::vector const& column_names, device_span rec_starts, rmm::cuda_stream_view stream); @@ -182,6 +182,7 @@ class reader_impl { */ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, std::vector const& dtypes, + std::vector const& column_names, device_span rec_starts, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); From 5e307b5bcb2e18f12ffd99a1ddb55497a8b68b24 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 20 Aug 2021 17:30:48 -0500 Subject: [PATCH 17/32] replace json reader_impl::data_ with local variable --- cpp/src/io/json/reader_impl.cu | 106 ++++++++++++++----------- cpp/src/io/json/reader_impl.hpp | 18 +++-- cpp/src/io/utilities/parsing_utils.cu | 59 ++++++-------- cpp/src/io/utilities/parsing_utils.cuh | 14 ++-- 4 files changed, 102 insertions(+), 95 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 65179866595..7166cb776c8 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -186,13 +186,10 @@ auto sort_keys_info_by_offset(std::unique_ptr
info) std::pair, col_map_ptr_type> reader_impl::get_json_object_keys_hashes( parse_options_view const& parse_opts, device_span rec_starts, + device_span data, rmm::cuda_stream_view stream) { - auto info = create_json_keys_info_table( - parse_opts, - device_span(static_cast(data_.data()), data_.size()), - rec_starts, - stream); + auto info = create_json_keys_info_table(parse_opts, data, rec_starts, stream); auto aggregated_info = aggregate_keys_info(std::move(info)); auto sorted_info = sort_keys_info_by_offset(std::move(aggregated_info)); @@ -236,9 +233,9 @@ bool should_load_whole_source(json_reader_options const& reader_opts) * Sets the uncomp_data_ and uncomp_size_ data members * Loads the data into device memory if byte range parameters are not used */ -void reader_impl::decompress_input(json_reader_options const& reader_opts, - std::vector const& buffer, - rmm::cuda_stream_view stream) +rmm::device_buffer reader_impl::decompress_input(json_reader_options const& reader_opts, + std::vector const& buffer, + rmm::cuda_stream_view stream) { if (reader_opts.get_compression() == compression_type::NONE) { // Do not use the owner vector here to avoid extra copy @@ -255,12 +252,16 @@ void reader_impl::decompress_input(json_reader_options const& reader_opts, uncomp_size_ = uncomp_data_owner_.size(); } if (should_load_whole_source(reader_opts)) { - data_ = rmm::device_buffer(uncomp_data_, uncomp_size_, stream); + return rmm::device_buffer(uncomp_data_, uncomp_size_, stream); + } else { + return {}; } } rmm::device_uvector reader_impl::find_record_starts( - json_reader_options const& reader_opts, rmm::cuda_stream_view stream) + json_reader_options const& reader_opts, + device_span data, + rmm::cuda_stream_view stream) { std::vector chars_to_count{'\n'}; // Currently, ignoring lineterminations within quotes is handled by recording the records of both, @@ -269,9 +270,10 @@ rmm::device_uvector reader_impl::find_record_starts( // If not starting at an offset, add an extra row to account for the first row in the file cudf::size_type prefilter_count = ((reader_opts.get_byte_range_offset() == 0) ? 1 : 0); if (should_load_whole_source(reader_opts)) { - prefilter_count += count_all_from_set(data_, chars_to_count, stream); + prefilter_count += count_all_from_set(data, chars_to_count, stream); } else { - prefilter_count += count_all_from_set(uncomp_data_, uncomp_size_, chars_to_count, stream); + prefilter_count += + count_all_from_set(host_span(uncomp_data_, uncomp_size_), chars_to_count, stream); } rmm::device_uvector rec_starts(prefilter_count, stream); @@ -286,9 +288,14 @@ rmm::device_uvector reader_impl::find_record_starts( std::vector chars_to_find{'\n'}; // Passing offset = 1 to return positions AFTER the found character if (should_load_whole_source(reader_opts)) { - find_all_from_set(data_, chars_to_find, 1, find_result_ptr, stream); + find_all_from_set(data, chars_to_find, 1, find_result_ptr, stream); } else { - find_all_from_set(uncomp_data_, uncomp_size_, chars_to_find, 1, find_result_ptr, stream); + find_all_from_set( // + host_span(uncomp_data_, uncomp_size_), + chars_to_find, + 1, + find_result_ptr, + stream); } // Previous call stores the record pinput_file.typeositions as encountered by all threads @@ -312,9 +319,9 @@ rmm::device_uvector reader_impl::find_record_starts( * Only rows that need to be parsed are copied, based on the byte range * Also updates the array of record starts to match the device data offset. 
*/ -void reader_impl::upload_data_to_device(json_reader_options const& reader_opts, - rmm::device_uvector& rec_starts, - rmm::cuda_stream_view stream) +rmm::device_buffer reader_impl::upload_data_to_device(json_reader_options const& reader_opts, + rmm::device_uvector& rec_starts, + rmm::cuda_stream_view stream) { size_t start_offset = 0; size_t end_offset = uncomp_size_; @@ -349,15 +356,16 @@ void reader_impl::upload_data_to_device(json_reader_options const& reader_opts, "Error finding the record within the specified byte range.\n"); // Upload the raw data that is within the rows of interest - data_ = rmm::device_buffer(uncomp_data_ + start_offset, bytes_to_upload, stream); + return rmm::device_buffer(uncomp_data_ + start_offset, bytes_to_upload, stream); } std::vector reader_impl::get_column_names(parse_options_view const& parse_opts, device_span rec_starts, + device_span data, rmm::cuda_stream_view stream) { // If file only contains one row, use the file size for the row size - uint64_t first_row_len = data_.size() / sizeof(char); + uint64_t first_row_len = data.size() / sizeof(char); if (rec_starts.size() > 1) { // Set first_row_len to the offset of the second row, if it exists CUDA_TRY(cudaMemcpyAsync(&first_row_len, @@ -368,7 +376,7 @@ std::vector reader_impl::get_column_names(parse_options_view const& } std::vector first_row(first_row_len); CUDA_TRY(cudaMemcpyAsync(first_row.data(), - data_.data(), + data.data(), first_row_len * sizeof(char), cudaMemcpyDeviceToHost, stream.value())); @@ -385,7 +393,7 @@ std::vector reader_impl::get_column_names(parse_options_view const& // If the first opening bracket is '{', assume object format if (first_curly_bracket < first_square_bracket) { // use keys as column names if input rows are objects - auto keys_desc = get_json_object_keys_hashes(parse_opts, rec_starts, stream); + auto keys_desc = get_json_object_keys_hashes(parse_opts, rec_starts, data, stream); set_column_map(std::move(keys_desc.second), stream); return keys_desc.first; } else { @@ -453,6 +461,7 @@ std::vector reader_impl::get_data_types(json_reader_options const& re parse_options_view const& parse_opts, std::vector const& column_names, device_span rec_starts, + device_span data, rmm::cuda_stream_view stream) { bool has_to_infer_column_types = @@ -483,14 +492,13 @@ std::vector reader_impl::get_data_types(json_reader_options const& re auto const num_columns = column_names.size(); auto const do_set_null_count = key_to_col_idx_map_ != nullptr; - auto const h_column_infos = cudf::io::json::gpu::detect_data_types( - parse_opts, - device_span(static_cast(data_.data()), data_.size()), - rec_starts, - do_set_null_count, - num_columns, - get_column_map_device_ptr(), - stream); + auto const h_column_infos = cudf::io::json::gpu::detect_data_types(parse_opts, + data, + rec_starts, + do_set_null_count, + num_columns, + get_column_map_device_ptr(), + stream); auto get_type_id = [&](auto const& cinfo) { auto int_count_total = @@ -532,6 +540,7 @@ table_with_metadata reader_impl::convert_data_to_table(parse_options_view const& std::vector const& dtypes, std::vector const& column_names, device_span rec_starts, + device_span data, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -560,16 +569,15 @@ table_with_metadata reader_impl::convert_data_to_table(parse_options_view const& auto d_valid_counts = cudf::detail::make_zeroed_device_uvector_async(num_columns, stream); - cudf::io::json::gpu::convert_json_to_columns( - parse_opts, - device_span(static_cast(data_.data()), 
data_.size()), - rec_starts, - d_dtypes, - get_column_map_device_ptr(), - d_data, - d_valid, - d_valid_counts, - stream); + cudf::io::json::gpu::convert_json_to_columns(parse_opts, + data, + rec_starts, + d_dtypes, + get_column_map_device_ptr(), + d_data, + d_valid, + d_valid_counts, + stream); stream.synchronize(); @@ -649,26 +657,32 @@ table_with_metadata reader_impl::read(std::vector>& CUDF_EXPECTS(buffer.size() != 0, "Ingest failed: input data is null.\n"); - decompress_input(reader_opts, buffer, stream); + auto data = decompress_input(reader_opts, buffer, stream); + auto data_span = device_span(static_cast(data.data()), data.size()); CUDF_EXPECTS(uncomp_data_ != nullptr, "Ingest failed: uncompressed input data is null.\n"); CUDF_EXPECTS(uncomp_size_ != 0, "Ingest failed: uncompressed input data has zero size.\n"); - auto rec_starts = find_record_starts(reader_opts, stream); + auto rec_starts = find_record_starts(reader_opts, data_span, stream); + CUDF_EXPECTS(!rec_starts.is_empty(), "Error enumerating records.\n"); - upload_data_to_device(reader_opts, rec_starts, stream); - CUDF_EXPECTS(data_.size() != 0, "Error uploading input data to the GPU.\n"); + data = upload_data_to_device(reader_opts, rec_starts, stream); + data_span = device_span(static_cast(data.data()), data.size()); + + CUDF_EXPECTS(data_span.size() != 0, "Error uploading input data to the GPU.\n"); - auto column_names = get_column_names(parse_opts.view(), rec_starts, stream); + auto column_names = get_column_names(parse_opts.view(), rec_starts, data_span, stream); CUDF_EXPECTS(not column_names.empty(), "Error determining column names.\n"); - auto dtypes = get_data_types(reader_opts, parse_opts.view(), column_names, rec_starts, stream); + auto dtypes = + get_data_types(reader_opts, parse_opts.view(), column_names, rec_starts, data_span, stream); CUDF_EXPECTS(not dtypes.empty(), "Error in data type detection.\n"); - return convert_data_to_table(parse_opts.view(), dtypes, column_names, rec_starts, stream, mr); + return convert_data_to_table( + parse_opts.view(), dtypes, column_names, rec_starts, data_span, stream, mr); } table_with_metadata read_json(std::vector>& sources, diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 75031ebb68a..fa464a0ef8b 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -57,7 +57,6 @@ class reader_impl { // Used when the input data is compressed, to ensure the allocated uncompressed data is freed std::vector uncomp_data_owner_; - rmm::device_buffer data_; // the map is only used for files with rows in object format; initialize to a dummy value so the // map object can be passed to the kernel in any case @@ -108,6 +107,7 @@ class reader_impl { std::pair, col_map_ptr_type> get_json_object_keys_hashes( parse_options_view const& parse_opts, device_span rec_starts, + device_span data, rmm::cuda_stream_view stream); /** @@ -115,9 +115,9 @@ class reader_impl { * * Sets the uncomp_data_ and uncomp_size_ data members */ - void decompress_input(json_reader_options const& options, - std::vector const& buffer, - rmm::cuda_stream_view stream); + rmm::device_buffer decompress_input(json_reader_options const& options, + std::vector const& buffer, + rmm::cuda_stream_view stream); /** * @brief Finds all record starts in the file. 
@@ -128,6 +128,7 @@ class reader_impl { * @return Record starts in the device memory */ rmm::device_uvector find_record_starts(json_reader_options const& reader_opts, + device_span data, rmm::cuda_stream_view stream); /** @@ -137,9 +138,9 @@ class reader_impl { * Only rows that need to be parsed are copied, based on the byte range * Also updates the array of record starts to match the device data offset. */ - void upload_data_to_device(json_reader_options const& reader_opts, - rmm::device_uvector& rec_starts, - rmm::cuda_stream_view stream); + rmm::device_buffer upload_data_to_device(json_reader_options const& reader_opts, + rmm::device_uvector& rec_starts, + rmm::cuda_stream_view stream); /** * @brief Parse the first row to set the column name @@ -151,6 +152,7 @@ class reader_impl { */ std::vector get_column_names(parse_options_view const& parse_opts, device_span rec_starts, + device_span data, rmm::cuda_stream_view stream); std::vector parse_data_types(std::vector const& column_names, @@ -169,6 +171,7 @@ class reader_impl { parse_options_view const& parse_opts, std::vector const& column_names, device_span rec_starts, + device_span data, rmm::cuda_stream_view stream); /** @@ -184,6 +187,7 @@ class reader_impl { std::vector const& dtypes, std::vector const& column_names, device_span rec_starts, + device_span data, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); diff --git a/cpp/src/io/utilities/parsing_utils.cu b/cpp/src/io/utilities/parsing_utils.cu index ba62238c5d3..2edf2d7505e 100644 --- a/cpp/src/io/utilities/parsing_utils.cu +++ b/cpp/src/io/utilities/parsing_utils.cu @@ -100,8 +100,8 @@ __global__ void count_and_set_positions(const char* data, } // namespace template -cudf::size_type find_all_from_set(const rmm::device_buffer& d_data, - const std::vector& keys, +cudf::size_type find_all_from_set(device_span data, + std::vector const& keys, uint64_t result_offset, T* positions, rmm::cuda_stream_view stream) @@ -110,31 +110,25 @@ cudf::size_type find_all_from_set(const rmm::device_buffer& d_data, int min_grid_size = 0; // minimum block count required CUDA_TRY( cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, count_and_set_positions)); - const int grid_size = divCeil(d_data.size(), (size_t)block_size); + const int grid_size = divCeil(data.size(), (size_t)block_size); auto d_count = cudf::detail::make_zeroed_device_uvector_async(1, stream); for (char key : keys) { - count_and_set_positions - <<>>(static_cast(d_data.data()), - d_data.size(), - result_offset, - key, - d_count.data(), - positions); + count_and_set_positions<<>>( + data.data(), data.size(), result_offset, key, d_count.data(), positions); } return cudf::detail::make_std_vector_sync(d_count, stream)[0]; } template -cudf::size_type find_all_from_set(const char* h_data, - size_t h_size, +cudf::size_type find_all_from_set(host_span data, const std::vector& keys, uint64_t result_offset, T* positions, rmm::cuda_stream_view stream) { - rmm::device_buffer d_chunk(std::min(max_chunk_bytes, h_size), stream); + rmm::device_buffer d_chunk(std::min(max_chunk_bytes, data.size()), stream); auto d_count = cudf::detail::make_zeroed_device_uvector_async(1, stream); int block_size = 0; // suggested thread count to use @@ -142,13 +136,13 @@ cudf::size_type find_all_from_set(const char* h_data, CUDA_TRY( cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, count_and_set_positions)); - const size_t chunk_count = divCeil(h_size, max_chunk_bytes); + const size_t chunk_count = divCeil(data.size(), 
max_chunk_bytes); for (size_t ci = 0; ci < chunk_count; ++ci) { const auto chunk_offset = ci * max_chunk_bytes; - const auto h_chunk = h_data + chunk_offset; - const int chunk_bytes = std::min((size_t)(h_size - ci * max_chunk_bytes), max_chunk_bytes); - const auto chunk_bits = divCeil(chunk_bytes, bytes_per_find_thread); - const int grid_size = divCeil(chunk_bits, block_size); + const auto h_chunk = data.data() + chunk_offset; + const int chunk_bytes = std::min((size_t)(data.size() - ci * max_chunk_bytes), max_chunk_bytes); + const auto chunk_bits = divCeil(chunk_bytes, bytes_per_find_thread); + const int grid_size = divCeil(chunk_bits, block_size); // Copy chunk to device CUDA_TRY( @@ -168,45 +162,42 @@ cudf::size_type find_all_from_set(const char* h_data, return cudf::detail::make_std_vector_sync(d_count, stream)[0]; } -template cudf::size_type find_all_from_set(const rmm::device_buffer& d_data, - const std::vector& keys, +template cudf::size_type find_all_from_set(device_span data, + std::vector const& keys, uint64_t result_offset, uint64_t* positions, rmm::cuda_stream_view stream); -template cudf::size_type find_all_from_set(const rmm::device_buffer& d_data, - const std::vector& keys, +template cudf::size_type find_all_from_set(device_span data, + std::vector const& keys, uint64_t result_offset, pos_key_pair* positions, rmm::cuda_stream_view stream); -template cudf::size_type find_all_from_set(const char* h_data, - size_t h_size, - const std::vector& keys, +template cudf::size_type find_all_from_set(host_span data, + std::vector const& keys, uint64_t result_offset, uint64_t* positions, rmm::cuda_stream_view stream); -template cudf::size_type find_all_from_set(const char* h_data, - size_t h_size, - const std::vector& keys, +template cudf::size_type find_all_from_set(host_span data, + std::vector const& keys, uint64_t result_offset, pos_key_pair* positions, rmm::cuda_stream_view stream); -cudf::size_type count_all_from_set(const rmm::device_buffer& d_data, - const std::vector& keys, +cudf::size_type count_all_from_set(device_span data, + std::vector const& keys, rmm::cuda_stream_view stream) { - return find_all_from_set(d_data, keys, 0, nullptr, stream); + return find_all_from_set(data, keys, 0, nullptr, stream); } -cudf::size_type count_all_from_set(const char* h_data, - size_t h_size, +cudf::size_type count_all_from_set(host_span data, const std::vector& keys, rmm::cuda_stream_view stream) { - return find_all_from_set(h_data, h_size, keys, 0, nullptr, stream); + return find_all_from_set(data, keys, 0, nullptr, stream); } } // namespace io diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index daf23de7eb2..73369e75f59 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -390,8 +390,8 @@ __device__ __inline__ cudf::size_type* infer_integral_field_counter(char const* * @return cudf::size_type total number of occurrences */ template -cudf::size_type find_all_from_set(const rmm::device_buffer& d_data, - const std::vector& keys, +cudf::size_type find_all_from_set(device_span data, + std::vector const& keys, uint64_t result_offset, T* positions, rmm::cuda_stream_view stream); @@ -414,8 +414,7 @@ cudf::size_type find_all_from_set(const rmm::device_buffer& d_data, * @return cudf::size_type total number of occurrences */ template -cudf::size_type find_all_from_set(const char* h_data, - size_t h_size, +cudf::size_type find_all_from_set(host_span data, const std::vector& keys, uint64_t result_offset, T* 
positions, @@ -431,8 +430,8 @@ cudf::size_type find_all_from_set(const char* h_data, * * @return cudf::size_type total number of occurrences */ -cudf::size_type count_all_from_set(const rmm::device_buffer& d_data, - const std::vector& keys, +cudf::size_type count_all_from_set(device_span data, + std::vector const& keys, rmm::cuda_stream_view stream); /** @@ -449,8 +448,7 @@ cudf::size_type count_all_from_set(const rmm::device_buffer& d_data, * * @return cudf::size_type total number of occurrences */ -cudf::size_type count_all_from_set(const char* h_data, - size_t h_size, +cudf::size_type count_all_from_set(host_span data, const std::vector& keys, rmm::cuda_stream_view stream); From f84dcc3d12f520af6bec92950b4f494ef181a2c6 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 20 Aug 2021 21:09:03 -0500 Subject: [PATCH 18/32] replace json::reader_impl column_map members with local variables --- cpp/src/hash/concurrent_unordered_map.cuh | 7 ++- cpp/src/io/json/json_gpu.cu | 26 ++++++----- cpp/src/io/json/reader_impl.cu | 53 ++++++++++------------- cpp/src/io/json/reader_impl.hpp | 36 +++------------ 4 files changed, 51 insertions(+), 71 deletions(-) diff --git a/cpp/src/hash/concurrent_unordered_map.cuh b/cpp/src/hash/concurrent_unordered_map.cuh index c4a9da9285d..a3f954920c8 100644 --- a/cpp/src/hash/concurrent_unordered_map.cuh +++ b/cpp/src/hash/concurrent_unordered_map.cuh @@ -538,8 +538,11 @@ class concurrent_unordered_map { } } - init_hashtbl<<<((m_capacity - 1) / block_size) + 1, block_size, 0, stream.value()>>>( - m_hashtbl_values, m_capacity, m_unused_key, m_unused_element); + if (m_capacity > 0) { + init_hashtbl<<<((m_capacity - 1) / block_size) + 1, block_size, 0, stream.value()>>>( + m_hashtbl_values, m_capacity, m_unused_key, m_unused_element); + } + CUDA_TRY(cudaGetLastError()); } }; diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index ba6bc30e0d4..d3930daefd2 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -424,19 +424,19 @@ __device__ field_descriptor next_field_descriptor(const char* begin, const char* end, parse_options_view const& opts, cudf::size_type field_idx, - col_map_type* col_map) + col_map_type col_map) { auto const desc_pre_trim = - col_map == nullptr + col_map.capacity() == 0 // No key - column and begin are trivial ? field_descriptor{field_idx, begin, cudf::io::gpu::seek_field_end(begin, end, opts, true)} : [&]() { auto const key_range = get_next_key(begin, end, opts.quotechar); auto const key_hash = MurmurHash3_32{}( cudf::string_view(key_range.first, key_range.second - key_range.first)); - auto const hash_col = col_map->find(key_hash); + auto const hash_col = col_map.find(key_hash); // Fall back to field index if not found (parsing error) - auto const column = (hash_col != col_map->end()) ? (*hash_col).second : field_idx; + auto const column = (hash_col != col_map.end()) ? 
(*hash_col).second : field_idx; // Skip the colon between the key and the value auto const value_begin = thrust::find(thrust::seq, key_range.second, end, ':') + 1; @@ -491,7 +491,7 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts, device_span const data, device_span const row_offsets, device_span const column_types, - col_map_type* col_map, + col_map_type col_map, device_span const output_columns, device_span const valid_fields, device_span const num_valid_fields) @@ -562,14 +562,14 @@ __global__ void detect_data_types_kernel( parse_options_view const opts, device_span const data, device_span const row_offsets, - col_map_type* col_map, + col_map_type col_map, int num_columns, device_span const column_infos) { auto const rec_id = threadIdx.x + (blockDim.x * blockIdx.x); if (rec_id >= row_offsets.size()) return; - auto const are_rows_objects = col_map != nullptr; + auto const are_rows_objects = col_map.capacity() != 0; auto const row_data_range = get_row_data_range(data, row_offsets, rec_id); size_type input_field_index = 0; @@ -768,8 +768,14 @@ void convert_json_to_columns(parse_options_view const& opts, const int grid_size = (row_offsets.size() + block_size - 1) / block_size; - convert_data_to_columns_kernel<<>>( - opts, data, row_offsets, column_types, col_map, output_columns, valid_fields, num_valid_fields); + convert_data_to_columns_kernel<<>>(opts, + data, + row_offsets, + column_types, + *col_map, + output_columns, + valid_fields, + num_valid_fields); CUDA_TRY(cudaGetLastError()); } @@ -814,7 +820,7 @@ std::vector detect_data_types( const int grid_size = (row_offsets.size() + block_size - 1) / block_size; detect_data_types_kernel<<>>( - options, data, row_offsets, col_map, num_columns, d_column_infos); + options, data, row_offsets, *col_map, num_columns, d_column_infos); return cudf::detail::make_std_vector_sync(d_column_infos, stream); } diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 7166cb776c8..4a1b0dc5afc 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -88,7 +88,7 @@ std::unique_ptr
<table> aggregate_keys_info(std::unique_ptr<table>
info) col_map_ptr_type create_col_names_hash_map(column_view column_name_hashes, rmm::cuda_stream_view stream) { - auto key_col_map{col_map_type::create(column_name_hashes.size(), stream)}; + auto key_col_map = col_map_type::create(column_name_hashes.size(), stream); auto const column_data = column_name_hashes.data(); thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -359,10 +359,11 @@ rmm::device_buffer reader_impl::upload_data_to_device(json_reader_options const& return rmm::device_buffer(uncomp_data_ + start_offset, bytes_to_upload, stream); } -std::vector reader_impl::get_column_names(parse_options_view const& parse_opts, - device_span rec_starts, - device_span data, - rmm::cuda_stream_view stream) +std::pair, col_map_ptr_type> reader_impl::get_column_names_and_map( + parse_options_view const& parse_opts, + device_span rec_starts, + device_span data, + rmm::cuda_stream_view stream) { // If file only contains one row, use the file size for the row size uint64_t first_row_len = data.size() / sizeof(char); @@ -393,9 +394,7 @@ std::vector reader_impl::get_column_names(parse_options_view const& // If the first opening bracket is '{', assume object format if (first_curly_bracket < first_square_bracket) { // use keys as column names if input rows are objects - auto keys_desc = get_json_object_keys_hashes(parse_opts, rec_starts, data, stream); - set_column_map(std::move(keys_desc.second), stream); - return keys_desc.first; + return get_json_object_keys_hashes(parse_opts, rec_starts, data, stream); } else { int cols_found = 0; bool quotation = false; @@ -411,7 +410,7 @@ std::vector reader_impl::get_column_names(parse_options_view const& column_names.emplace_back(std::to_string(cols_found++)); } } - return column_names; + return {column_names, col_map_type::create(0, stream)}; } } @@ -460,6 +459,7 @@ std::vector reader_impl::parse_data_types( std::vector reader_impl::get_data_types(json_reader_options const& reader_opts, parse_options_view const& parse_opts, std::vector const& column_names, + col_map_type* column_map, device_span rec_starts, device_span data, rmm::cuda_stream_view stream) @@ -490,15 +490,10 @@ std::vector reader_impl::get_data_types(json_reader_options const& re } else { CUDF_EXPECTS(rec_starts.size() != 0, "No data available for data type inference.\n"); auto const num_columns = column_names.size(); - auto const do_set_null_count = key_to_col_idx_map_ != nullptr; + auto const do_set_null_count = column_map->capacity() > 0; - auto const h_column_infos = cudf::io::json::gpu::detect_data_types(parse_opts, - data, - rec_starts, - do_set_null_count, - num_columns, - get_column_map_device_ptr(), - stream); + auto const h_column_infos = cudf::io::json::gpu::detect_data_types( + parse_opts, data, rec_starts, do_set_null_count, num_columns, column_map, stream); auto get_type_id = [&](auto const& cinfo) { auto int_count_total = @@ -539,6 +534,7 @@ std::vector reader_impl::get_data_types(json_reader_options const& re table_with_metadata reader_impl::convert_data_to_table(parse_options_view const& parse_opts, std::vector const& dtypes, std::vector const& column_names, + col_map_type* column_map, device_span rec_starts, device_span data, rmm::cuda_stream_view stream, @@ -569,15 +565,8 @@ table_with_metadata reader_impl::convert_data_to_table(parse_options_view const& auto d_valid_counts = cudf::detail::make_zeroed_device_uvector_async(num_columns, stream); - cudf::io::json::gpu::convert_json_to_columns(parse_opts, - data, - rec_starts, - d_dtypes, - 
get_column_map_device_ptr(), - d_data, - d_valid, - d_valid_counts, - stream); + cudf::io::json::gpu::convert_json_to_columns( + parse_opts, data, rec_starts, d_dtypes, column_map, d_data, d_valid, d_valid_counts, stream); stream.synchronize(); @@ -672,17 +661,21 @@ table_with_metadata reader_impl::read(std::vector>& CUDF_EXPECTS(data_span.size() != 0, "Error uploading input data to the GPU.\n"); - auto column_names = get_column_names(parse_opts.view(), rec_starts, data_span, stream); + auto column_names_and_map = + get_column_names_and_map(parse_opts.view(), rec_starts, data_span, stream); + + auto column_names = std::get<0>(column_names_and_map); + auto column_map = std::move(std::get<1>(column_names_and_map)); CUDF_EXPECTS(not column_names.empty(), "Error determining column names.\n"); - auto dtypes = - get_data_types(reader_opts, parse_opts.view(), column_names, rec_starts, data_span, stream); + auto dtypes = get_data_types( + reader_opts, parse_opts.view(), column_names, column_map.get(), rec_starts, data_span, stream); CUDF_EXPECTS(not dtypes.empty(), "Error in data type detection.\n"); return convert_data_to_table( - parse_opts.view(), dtypes, column_names, rec_starts, data_span, stream, mr); + parse_opts.view(), dtypes, column_names, column_map.get(), rec_starts, data_span, stream, mr); } table_with_metadata read_json(std::vector>& sources, diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index fa464a0ef8b..bf421f1604d 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -58,31 +58,6 @@ class reader_impl { // Used when the input data is compressed, to ensure the allocated uncompressed data is freed std::vector uncomp_data_owner_; - // the map is only used for files with rows in object format; initialize to a dummy value so the - // map object can be passed to the kernel in any case - col_map_ptr_type key_to_col_idx_map_; - std::unique_ptr> d_key_col_map_; - - /** - * @brief Sets the column map data member and makes a device copy to be used as a kernel - * parameter. - */ - void set_column_map(col_map_ptr_type&& map, rmm::cuda_stream_view stream) - { - key_to_col_idx_map_ = std::move(map); - d_key_col_map_ = - std::make_unique>(*key_to_col_idx_map_, stream); - } - /** - * @brief Gets the pointer to the column hash map in the device memory. - * - * Returns `nullptr` if the map is not created. - */ - auto get_column_map_device_ptr() - { - return key_to_col_idx_map_ ? d_key_col_map_->data() : nullptr; - } - /** * @brief Ingest input JSON file/buffer, without decompression * @@ -150,10 +125,11 @@ class reader_impl { * @param[in] rec_starts Record starts in device memory * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
*/ - std::vector get_column_names(parse_options_view const& parse_opts, - device_span rec_starts, - device_span data, - rmm::cuda_stream_view stream); + std::pair, col_map_ptr_type> get_column_names_and_map( + parse_options_view const& parse_opts, + device_span rec_starts, + device_span data, + rmm::cuda_stream_view stream); std::vector parse_data_types(std::vector const& column_names, std::vector const& types_as_strings); @@ -170,6 +146,7 @@ class reader_impl { std::vector get_data_types(json_reader_options const& reader_opts, parse_options_view const& parse_opts, std::vector const& column_names, + col_map_type* column_map, device_span rec_starts, device_span data, rmm::cuda_stream_view stream); @@ -186,6 +163,7 @@ class reader_impl { table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, std::vector const& dtypes, std::vector const& column_names, + col_map_type* column_map, device_span rec_starts, device_span data, rmm::cuda_stream_view stream, From e11e9dbe3ae694314548f008bd76eac0498a319c Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 20 Aug 2021 21:28:33 -0500 Subject: [PATCH 19/32] change json::reader_impl host buffer type from uint8_t to char --- cpp/src/io/json/reader_impl.cu | 15 ++++++++------- cpp/src/io/json/reader_impl.hpp | 4 ++-- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 4a1b0dc5afc..dd5305ebfb8 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -199,7 +199,7 @@ std::pair, col_map_ptr_type> reader_impl::get_json_obje } void reader_impl::ingest_raw_input(std::vector> const& sources, - std::vector& buffer, + std::vector& buffer, size_t range_offset, size_t range_size, size_t range_size_padded) @@ -215,8 +215,9 @@ void reader_impl::ingest_raw_input(std::vector> cons size_t bytes_read = 0; for (const auto& source : sources) { if (!source->is_empty()) { - auto data_size = (range_size_padded != 0) ? range_size_padded : source->size(); - bytes_read += source->host_read(range_offset, data_size, &buffer[bytes_read]); + auto data_size = (range_size_padded != 0) ? 
range_size_padded : source->size(); + auto destination = reinterpret_cast(buffer.data()) + bytes_read; + bytes_read += source->host_read(range_offset, data_size, destination); } } } @@ -234,17 +235,17 @@ bool should_load_whole_source(json_reader_options const& reader_opts) * Loads the data into device memory if byte range parameters are not used */ rmm::device_buffer reader_impl::decompress_input(json_reader_options const& reader_opts, - std::vector const& buffer, + std::vector const& buffer, rmm::cuda_stream_view stream) { if (reader_opts.get_compression() == compression_type::NONE) { // Do not use the owner vector here to avoid extra copy - uncomp_data_ = reinterpret_cast(buffer.data()); + uncomp_data_ = buffer.data(); uncomp_size_ = buffer.size(); } else { uncomp_data_owner_ = get_uncompressed_data( // host_span( // - reinterpret_cast(buffer.data()), + buffer.data(), buffer.size()), reader_opts.get_compression()); @@ -640,7 +641,7 @@ table_with_metadata reader_impl::read(std::vector>& auto range_size = reader_opts.get_byte_range_size(); auto range_size_padded = reader_opts.get_byte_range_size_with_padding(); - std::vector buffer; + std::vector buffer; ingest_raw_input(sources, buffer, range_offset, range_size, range_size_padded); diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index bf421f1604d..0e6a8005055 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -69,7 +69,7 @@ class reader_impl { * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data */ void ingest_raw_input(std::vector> const& sources, - std::vector& buffer, + std::vector& buffer, size_t range_offset, size_t range_size, size_t range_size_padded); @@ -91,7 +91,7 @@ class reader_impl { * Sets the uncomp_data_ and uncomp_size_ data members */ rmm::device_buffer decompress_input(json_reader_options const& options, - std::vector const& buffer, + std::vector const& buffer, rmm::cuda_stream_view stream); /** From df102172095e36958023412bf97729bef04c2f9d Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 20 Aug 2021 21:42:40 -0500 Subject: [PATCH 20/32] replace json::reader_impl uncomp data members with single span member --- cpp/src/io/json/reader_impl.cu | 60 +++++++++++++++------------------ cpp/src/io/json/reader_impl.hpp | 15 ++++----- 2 files changed, 35 insertions(+), 40 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index dd5305ebfb8..9e01e9f8405 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -194,7 +194,7 @@ std::pair, col_map_ptr_type> reader_impl::get_json_obje auto aggregated_info = aggregate_keys_info(std::move(info)); auto sorted_info = sort_keys_info_by_offset(std::move(aggregated_info)); - return {create_key_strings(uncomp_data_, sorted_info->view(), stream), + return {create_key_strings(uncomp_data_.data(), sorted_info->view(), stream), create_col_names_hash_map(sorted_info->get_column(2).view(), stream)}; } @@ -234,14 +234,13 @@ bool should_load_whole_source(json_reader_options const& reader_opts) * Sets the uncomp_data_ and uncomp_size_ data members * Loads the data into device memory if byte range parameters are not used */ -rmm::device_buffer reader_impl::decompress_input(json_reader_options const& reader_opts, - std::vector const& buffer, - rmm::cuda_stream_view stream) +rmm::device_uvector reader_impl::decompress_input(json_reader_options const& reader_opts, + std::vector const& buffer, + rmm::cuda_stream_view stream) { 
if (reader_opts.get_compression() == compression_type::NONE) { // Do not use the owner vector here to avoid extra copy - uncomp_data_ = buffer.data(); - uncomp_size_ = buffer.size(); + uncomp_data_ = host_span(static_cast(buffer.data()), buffer.size()); } else { uncomp_data_owner_ = get_uncompressed_data( // host_span( // @@ -249,13 +248,12 @@ rmm::device_buffer reader_impl::decompress_input(json_reader_options const& read buffer.size()), reader_opts.get_compression()); - uncomp_data_ = uncomp_data_owner_.data(); - uncomp_size_ = uncomp_data_owner_.size(); + uncomp_data_ = host_span(uncomp_data_owner_.data(), uncomp_data_owner_.size()); } if (should_load_whole_source(reader_opts)) { - return rmm::device_buffer(uncomp_data_, uncomp_size_, stream); + return cudf::detail::make_device_uvector_async(uncomp_data_, stream); } else { - return {}; + return rmm::device_uvector(0, stream); } } @@ -273,8 +271,7 @@ rmm::device_uvector reader_impl::find_record_starts( if (should_load_whole_source(reader_opts)) { prefilter_count += count_all_from_set(data, chars_to_count, stream); } else { - prefilter_count += - count_all_from_set(host_span(uncomp_data_, uncomp_size_), chars_to_count, stream); + prefilter_count += count_all_from_set(uncomp_data_, chars_to_count, stream); } rmm::device_uvector rec_starts(prefilter_count, stream); @@ -292,7 +289,7 @@ rmm::device_uvector reader_impl::find_record_starts( find_all_from_set(data, chars_to_find, 1, find_result_ptr, stream); } else { find_all_from_set( // - host_span(uncomp_data_, uncomp_size_), + uncomp_data_, chars_to_find, 1, find_result_ptr, @@ -307,7 +304,7 @@ rmm::device_uvector reader_impl::find_record_starts( auto filtered_count = prefilter_count; // Exclude the ending newline as it does not precede a record start - if (uncomp_data_[uncomp_size_ - 1] == '\n') { filtered_count--; } + if (uncomp_data_.back() == '\n') { filtered_count--; } rec_starts.resize(filtered_count, stream); return rec_starts; @@ -320,12 +317,13 @@ rmm::device_uvector reader_impl::find_record_starts( * Only rows that need to be parsed are copied, based on the byte range * Also updates the array of record starts to match the device data offset. 
*/ -rmm::device_buffer reader_impl::upload_data_to_device(json_reader_options const& reader_opts, - rmm::device_uvector& rec_starts, - rmm::cuda_stream_view stream) +rmm::device_uvector reader_impl::upload_data_to_device( + json_reader_options const& reader_opts, + rmm::device_uvector& rec_starts, + rmm::cuda_stream_view stream) { size_t start_offset = 0; - size_t end_offset = uncomp_size_; + size_t end_offset = uncomp_data_.size(); // Trim lines that are outside range if (reader_opts.get_byte_range_size() != 0 || reader_opts.get_byte_range_offset() != 0) { @@ -353,11 +351,12 @@ rmm::device_buffer reader_impl::upload_data_to_device(json_reader_options const& } const size_t bytes_to_upload = end_offset - start_offset; - CUDF_EXPECTS(bytes_to_upload <= uncomp_size_, + CUDF_EXPECTS(bytes_to_upload <= uncomp_data_.size(), "Error finding the record within the specified byte range.\n"); // Upload the raw data that is within the rows of interest - return rmm::device_buffer(uncomp_data_ + start_offset, bytes_to_upload, stream); + return cudf::detail::make_device_uvector_async( + uncomp_data_.subspan(start_offset, bytes_to_upload), stream); } std::pair, col_map_ptr_type> reader_impl::get_column_names_and_map( @@ -647,23 +646,20 @@ table_with_metadata reader_impl::read(std::vector>& CUDF_EXPECTS(buffer.size() != 0, "Ingest failed: input data is null.\n"); - auto data = decompress_input(reader_opts, buffer, stream); - auto data_span = device_span(static_cast(data.data()), data.size()); + auto data = decompress_input(reader_opts, buffer, stream); - CUDF_EXPECTS(uncomp_data_ != nullptr, "Ingest failed: uncompressed input data is null.\n"); - CUDF_EXPECTS(uncomp_size_ != 0, "Ingest failed: uncompressed input data has zero size.\n"); + CUDF_EXPECTS(uncomp_data_.data() != nullptr, "Ingest failed: uncompressed input data is null.\n"); + CUDF_EXPECTS(uncomp_data_.size() != 0, "Ingest failed: uncompressed input data has zero size.\n"); - auto rec_starts = find_record_starts(reader_opts, data_span, stream); + auto rec_starts = find_record_starts(reader_opts, data, stream); CUDF_EXPECTS(!rec_starts.is_empty(), "Error enumerating records.\n"); - data = upload_data_to_device(reader_opts, rec_starts, stream); - data_span = device_span(static_cast(data.data()), data.size()); + data = upload_data_to_device(reader_opts, rec_starts, stream); - CUDF_EXPECTS(data_span.size() != 0, "Error uploading input data to the GPU.\n"); + CUDF_EXPECTS(data.size() != 0, "Error uploading input data to the GPU.\n"); - auto column_names_and_map = - get_column_names_and_map(parse_opts.view(), rec_starts, data_span, stream); + auto column_names_and_map = get_column_names_and_map(parse_opts.view(), rec_starts, data, stream); auto column_names = std::get<0>(column_names_and_map); auto column_map = std::move(std::get<1>(column_names_and_map)); @@ -671,12 +667,12 @@ table_with_metadata reader_impl::read(std::vector>& CUDF_EXPECTS(not column_names.empty(), "Error determining column names.\n"); auto dtypes = get_data_types( - reader_opts, parse_opts.view(), column_names, column_map.get(), rec_starts, data_span, stream); + reader_opts, parse_opts.view(), column_names, column_map.get(), rec_starts, data, stream); CUDF_EXPECTS(not dtypes.empty(), "Error in data type detection.\n"); return convert_data_to_table( - parse_opts.view(), dtypes, column_names, column_map.get(), rec_starts, data_span, stream, mr); + parse_opts.view(), dtypes, column_names, column_map.get(), rec_starts, data, stream, mr); } table_with_metadata read_json(std::vector>& 
sources, diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 0e6a8005055..79d70b2e121 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -52,8 +52,7 @@ using col_map_ptr_type = std::unique_ptr uncomp_data_; // Used when the input data is compressed, to ensure the allocated uncompressed data is freed std::vector uncomp_data_owner_; @@ -90,9 +89,9 @@ class reader_impl { * * Sets the uncomp_data_ and uncomp_size_ data members */ - rmm::device_buffer decompress_input(json_reader_options const& options, - std::vector const& buffer, - rmm::cuda_stream_view stream); + rmm::device_uvector decompress_input(json_reader_options const& options, + std::vector const& buffer, + rmm::cuda_stream_view stream); /** * @brief Finds all record starts in the file. @@ -113,9 +112,9 @@ class reader_impl { * Only rows that need to be parsed are copied, based on the byte range * Also updates the array of record starts to match the device data offset. */ - rmm::device_buffer upload_data_to_device(json_reader_options const& reader_opts, - rmm::device_uvector& rec_starts, - rmm::cuda_stream_view stream); + rmm::device_uvector upload_data_to_device(json_reader_options const& reader_opts, + rmm::device_uvector& rec_starts, + rmm::cuda_stream_view stream); /** * @brief Parse the first row to set the column name From 0417be815745137ad10605ccdabea33f9c4c481a Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 20 Aug 2021 22:06:19 -0500 Subject: [PATCH 21/32] json::reader_impl simplify device data copy logic --- cpp/src/io/json/reader_impl.cu | 68 ++++++++++++++++----------------- cpp/src/io/json/reader_impl.hpp | 6 +-- 2 files changed, 37 insertions(+), 37 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 9e01e9f8405..94d9a1b6574 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -234,9 +234,9 @@ bool should_load_whole_source(json_reader_options const& reader_opts) * Sets the uncomp_data_ and uncomp_size_ data members * Loads the data into device memory if byte range parameters are not used */ -rmm::device_uvector reader_impl::decompress_input(json_reader_options const& reader_opts, - std::vector const& buffer, - rmm::cuda_stream_view stream) +void reader_impl::decompress_input(json_reader_options const& reader_opts, + std::vector const& buffer, + rmm::cuda_stream_view stream) { if (reader_opts.get_compression() == compression_type::NONE) { // Do not use the owner vector here to avoid extra copy @@ -250,11 +250,6 @@ rmm::device_uvector reader_impl::decompress_input(json_reader_options cons uncomp_data_ = host_span(uncomp_data_owner_.data(), uncomp_data_owner_.size()); } - if (should_load_whole_source(reader_opts)) { - return cudf::detail::make_device_uvector_async(uncomp_data_, stream); - } else { - return rmm::device_uvector(0, stream); - } } rmm::device_uvector reader_impl::find_record_starts( @@ -322,34 +317,31 @@ rmm::device_uvector reader_impl::upload_data_to_device( rmm::device_uvector& rec_starts, rmm::cuda_stream_view stream) { - size_t start_offset = 0; - size_t end_offset = uncomp_data_.size(); + size_t end_offset = uncomp_data_.size(); // Trim lines that are outside range - if (reader_opts.get_byte_range_size() != 0 || reader_opts.get_byte_range_offset() != 0) { - auto h_rec_starts = cudf::detail::make_std_vector_sync(rec_starts, stream); - - if (reader_opts.get_byte_range_size() != 0) { - auto it = h_rec_starts.end() - 1; - while (it >= h_rec_starts.begin() 
&& *it > reader_opts.get_byte_range_size()) { - end_offset = *it; - --it; - } - h_rec_starts.erase(it + 1, h_rec_starts.end()); - } + auto h_rec_starts = cudf::detail::make_std_vector_sync(rec_starts, stream); - // Resize to exclude rows outside of the range - // Adjust row start positions to account for the data subcopy - start_offset = h_rec_starts.front(); - rec_starts.resize(h_rec_starts.size(), stream); - thrust::transform(rmm::exec_policy(stream), - rec_starts.begin(), - rec_starts.end(), - thrust::make_constant_iterator(start_offset), - rec_starts.begin(), - thrust::minus()); + if (reader_opts.get_byte_range_size() != 0) { + auto it = h_rec_starts.end() - 1; + while (it >= h_rec_starts.begin() && *it > reader_opts.get_byte_range_size()) { + end_offset = *it; + --it; + } + h_rec_starts.erase(it + 1, h_rec_starts.end()); } + // Resize to exclude rows outside of the range + // Adjust row start positions to account for the data subcopy + size_t start_offset = h_rec_starts.front(); + rec_starts.resize(h_rec_starts.size(), stream); + thrust::transform(rmm::exec_policy(stream), + rec_starts.begin(), + rec_starts.end(), + thrust::make_constant_iterator(start_offset), + rec_starts.begin(), + thrust::minus()); + const size_t bytes_to_upload = end_offset - start_offset; CUDF_EXPECTS(bytes_to_upload <= uncomp_data_.size(), "Error finding the record within the specified byte range.\n"); @@ -646,16 +638,24 @@ table_with_metadata reader_impl::read(std::vector>& CUDF_EXPECTS(buffer.size() != 0, "Ingest failed: input data is null.\n"); - auto data = decompress_input(reader_opts, buffer, stream); + decompress_input(reader_opts, buffer, stream); CUDF_EXPECTS(uncomp_data_.data() != nullptr, "Ingest failed: uncompressed input data is null.\n"); CUDF_EXPECTS(uncomp_data_.size() != 0, "Ingest failed: uncompressed input data has zero size.\n"); + auto data = rmm::device_uvector(0, stream); + + if (should_load_whole_source(reader_opts)) { + data = cudf::detail::make_device_uvector_async(uncomp_data_, stream); + } + auto rec_starts = find_record_starts(reader_opts, data, stream); - CUDF_EXPECTS(!rec_starts.is_empty(), "Error enumerating records.\n"); + CUDF_EXPECTS(rec_starts.size() > 0, "Error enumerating records.\n"); - data = upload_data_to_device(reader_opts, rec_starts, stream); + if (not should_load_whole_source(reader_opts)) { + data = upload_data_to_device(reader_opts, rec_starts, stream); + } CUDF_EXPECTS(data.size() != 0, "Error uploading input data to the GPU.\n"); diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 79d70b2e121..807cc98e751 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -89,9 +89,9 @@ class reader_impl { * * Sets the uncomp_data_ and uncomp_size_ data members */ - rmm::device_uvector decompress_input(json_reader_options const& options, - std::vector const& buffer, - rmm::cuda_stream_view stream); + void decompress_input(json_reader_options const& options, + std::vector const& buffer, + rmm::cuda_stream_view stream); /** * @brief Finds all record starts in the file. 
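The commits in this stretch of the series all make the same move: state that reader_impl used to stash in data members (data_, uncomp_data_, the column map) becomes a value that one step returns and the next step receives explicitly. Below is a minimal, self-contained sketch of that pattern; the types and function names are placeholders for illustration only, not libcudf APIs.

// Illustrative sketch only -- placeholder names, not libcudf code.
#include <cstddef>
#include <vector>

namespace sketch {

// "Before": steps communicate through a data member, so call order and buffer
// lifetime are implicit in the class state.
struct stateful_reader {
  std::vector<char> uncompressed_;  // filled by decompress(), read by parse()

  void decompress(std::vector<char> const& raw) { uncompressed_ = raw; }
  std::size_t parse() const { return uncompressed_.size(); }
};

// "After": each step is a function of its inputs and returns its result, so
// read() shows the whole data flow and the member disappears.
inline std::vector<char> decompress(std::vector<char> const& raw) { return raw; }
inline std::size_t parse(std::vector<char> const& uncompressed) { return uncompressed.size(); }

inline std::size_t read(std::vector<char> const& raw)
{
  auto const uncompressed = decompress(raw);  // local value instead of a member
  return parse(uncompressed);                 // dependency visible at the call site
}

}  // namespace sketch

Once every intermediate result is passed around as a plain value like this, the later commits in the series can drop the wrapper class entirely and keep only free functions.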
From f0fd5c132cffca4991a367c4dcb9651446ce3b36 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 20 Aug 2021 22:37:48 -0500 Subject: [PATCH 22/32] remove json::reader_impl::decompress_input function and inline the logic --- cpp/src/io/json/reader_impl.cu | 51 +++++++++++---------------------- cpp/src/io/json/reader_impl.hpp | 21 +++----------- 2 files changed, 21 insertions(+), 51 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 94d9a1b6574..d1da78d018f 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -198,11 +198,11 @@ std::pair, col_map_ptr_type> reader_impl::get_json_obje create_col_names_hash_map(sorted_info->get_column(2).view(), stream)}; } -void reader_impl::ingest_raw_input(std::vector> const& sources, - std::vector& buffer, - size_t range_offset, - size_t range_size, - size_t range_size_padded) +std::vector reader_impl::ingest_raw_input( + std::vector> const& sources, + size_t range_offset, + size_t range_size, + size_t range_size_padded) { // Iterate through the user defined sources and read the contents into the local buffer size_t total_source_size = 0; @@ -211,7 +211,8 @@ void reader_impl::ingest_raw_input(std::vector> cons } total_source_size = total_source_size - (range_offset * sources.size()); - buffer.resize(total_source_size); + auto buffer = std::vector(total_source_size); + size_t bytes_read = 0; for (const auto& source : sources) { if (!source->is_empty()) { @@ -220,6 +221,8 @@ void reader_impl::ingest_raw_input(std::vector> cons bytes_read += source->host_read(range_offset, data_size, destination); } } + + return buffer; } bool should_load_whole_source(json_reader_options const& reader_opts) @@ -228,30 +231,6 @@ bool should_load_whole_source(json_reader_options const& reader_opts) reader_opts.get_byte_range_size() == 0; } -/** - * @brief Decompress the input data, if needed - * - * Sets the uncomp_data_ and uncomp_size_ data members - * Loads the data into device memory if byte range parameters are not used - */ -void reader_impl::decompress_input(json_reader_options const& reader_opts, - std::vector const& buffer, - rmm::cuda_stream_view stream) -{ - if (reader_opts.get_compression() == compression_type::NONE) { - // Do not use the owner vector here to avoid extra copy - uncomp_data_ = host_span(static_cast(buffer.data()), buffer.size()); - } else { - uncomp_data_owner_ = get_uncompressed_data( // - host_span( // - buffer.data(), - buffer.size()), - reader_opts.get_compression()); - - uncomp_data_ = host_span(uncomp_data_owner_.data(), uncomp_data_owner_.size()); - } -} - rmm::device_uvector reader_impl::find_record_starts( json_reader_options const& reader_opts, device_span data, @@ -632,13 +611,17 @@ table_with_metadata reader_impl::read(std::vector>& auto range_size = reader_opts.get_byte_range_size(); auto range_size_padded = reader_opts.get_byte_range_size_with_padding(); - std::vector buffer; - - ingest_raw_input(sources, buffer, range_offset, range_size, range_size_padded); + auto buffer = ingest_raw_input(sources, range_offset, range_size, range_size_padded); CUDF_EXPECTS(buffer.size() != 0, "Ingest failed: input data is null.\n"); - decompress_input(reader_opts, buffer, stream); + if (reader_opts.get_compression() != compression_type::NONE) { + buffer = get_uncompressed_data( // + host_span(buffer.data(), buffer.size()), + reader_opts.get_compression()); + } + + uncomp_data_ = host_span(static_cast(buffer.data()), buffer.size()); CUDF_EXPECTS(uncomp_data_.data() != 
nullptr, "Ingest failed: uncompressed input data is null.\n"); CUDF_EXPECTS(uncomp_data_.size() != 0, "Ingest failed: uncompressed input data has zero size.\n"); diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 807cc98e751..98fceb78931 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -54,9 +54,6 @@ class reader_impl { private: host_span uncomp_data_; - // Used when the input data is compressed, to ensure the allocated uncompressed data is freed - std::vector uncomp_data_owner_; - /** * @brief Ingest input JSON file/buffer, without decompression * @@ -67,11 +64,10 @@ class reader_impl { * @param[in] range_size Bytes to read; use `0` for all remaining data * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data */ - void ingest_raw_input(std::vector> const& sources, - std::vector& buffer, - size_t range_offset, - size_t range_size, - size_t range_size_padded); + std::vector ingest_raw_input(std::vector> const& sources, + size_t range_offset, + size_t range_size, + size_t range_size_padded); /** * @brief Extract the JSON objects keys from the input file with object rows. @@ -84,15 +80,6 @@ class reader_impl { device_span data, rmm::cuda_stream_view stream); - /** - * @brief Decompress the input data, if needed - * - * Sets the uncomp_data_ and uncomp_size_ data members - */ - void decompress_input(json_reader_options const& options, - std::vector const& buffer, - rmm::cuda_stream_view stream); - /** * @brief Finds all record starts in the file. * From 45714bb0b392849a81d1001ab16c309385c51901 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 20 Aug 2021 22:49:38 -0500 Subject: [PATCH 23/32] replace json::reader_impl::uncomp_data_ member with local variable --- cpp/src/io/json/reader_impl.cu | 74 ++++++++++++++++----------------- cpp/src/io/json/reader_impl.hpp | 13 +++--- 2 files changed, 43 insertions(+), 44 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index d1da78d018f..88432d72f3f 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -185,16 +185,17 @@ auto sort_keys_info_by_offset(std::unique_ptr
info) */ std::pair, col_map_ptr_type> reader_impl::get_json_object_keys_hashes( parse_options_view const& parse_opts, + host_span h_data, device_span rec_starts, - device_span data, + device_span d_data, rmm::cuda_stream_view stream) { - auto info = create_json_keys_info_table(parse_opts, data, rec_starts, stream); + auto info = create_json_keys_info_table(parse_opts, d_data, rec_starts, stream); auto aggregated_info = aggregate_keys_info(std::move(info)); auto sorted_info = sort_keys_info_by_offset(std::move(aggregated_info)); - return {create_key_strings(uncomp_data_.data(), sorted_info->view(), stream), + return {create_key_strings(h_data.data(), sorted_info->view(), stream), create_col_names_hash_map(sorted_info->get_column(2).view(), stream)}; } @@ -233,7 +234,8 @@ bool should_load_whole_source(json_reader_options const& reader_opts) rmm::device_uvector reader_impl::find_record_starts( json_reader_options const& reader_opts, - device_span data, + host_span h_data, + device_span d_data, rmm::cuda_stream_view stream) { std::vector chars_to_count{'\n'}; @@ -243,9 +245,9 @@ rmm::device_uvector reader_impl::find_record_starts( // If not starting at an offset, add an extra row to account for the first row in the file cudf::size_type prefilter_count = ((reader_opts.get_byte_range_offset() == 0) ? 1 : 0); if (should_load_whole_source(reader_opts)) { - prefilter_count += count_all_from_set(data, chars_to_count, stream); + prefilter_count += count_all_from_set(d_data, chars_to_count, stream); } else { - prefilter_count += count_all_from_set(uncomp_data_, chars_to_count, stream); + prefilter_count += count_all_from_set(h_data, chars_to_count, stream); } rmm::device_uvector rec_starts(prefilter_count, stream); @@ -260,14 +262,9 @@ rmm::device_uvector reader_impl::find_record_starts( std::vector chars_to_find{'\n'}; // Passing offset = 1 to return positions AFTER the found character if (should_load_whole_source(reader_opts)) { - find_all_from_set(data, chars_to_find, 1, find_result_ptr, stream); + find_all_from_set(d_data, chars_to_find, 1, find_result_ptr, stream); } else { - find_all_from_set( // - uncomp_data_, - chars_to_find, - 1, - find_result_ptr, - stream); + find_all_from_set(h_data, chars_to_find, 1, find_result_ptr, stream); } // Previous call stores the record pinput_file.typeositions as encountered by all threads @@ -278,7 +275,7 @@ rmm::device_uvector reader_impl::find_record_starts( auto filtered_count = prefilter_count; // Exclude the ending newline as it does not precede a record start - if (uncomp_data_.back() == '\n') { filtered_count--; } + if (h_data.back() == '\n') { filtered_count--; } rec_starts.resize(filtered_count, stream); return rec_starts; @@ -293,10 +290,11 @@ rmm::device_uvector reader_impl::find_record_starts( */ rmm::device_uvector reader_impl::upload_data_to_device( json_reader_options const& reader_opts, + host_span h_data, rmm::device_uvector& rec_starts, rmm::cuda_stream_view stream) { - size_t end_offset = uncomp_data_.size(); + size_t end_offset = h_data.size(); // Trim lines that are outside range auto h_rec_starts = cudf::detail::make_std_vector_sync(rec_starts, stream); @@ -322,22 +320,23 @@ rmm::device_uvector reader_impl::upload_data_to_device( thrust::minus()); const size_t bytes_to_upload = end_offset - start_offset; - CUDF_EXPECTS(bytes_to_upload <= uncomp_data_.size(), + CUDF_EXPECTS(bytes_to_upload <= h_data.size(), "Error finding the record within the specified byte range.\n"); // Upload the raw data that is within the rows of interest - return 
cudf::detail::make_device_uvector_async( - uncomp_data_.subspan(start_offset, bytes_to_upload), stream); + return cudf::detail::make_device_uvector_async(h_data.subspan(start_offset, bytes_to_upload), + stream); } std::pair, col_map_ptr_type> reader_impl::get_column_names_and_map( parse_options_view const& parse_opts, + host_span h_data, device_span rec_starts, - device_span data, + device_span d_data, rmm::cuda_stream_view stream) { // If file only contains one row, use the file size for the row size - uint64_t first_row_len = data.size() / sizeof(char); + uint64_t first_row_len = d_data.size() / sizeof(char); if (rec_starts.size() > 1) { // Set first_row_len to the offset of the second row, if it exists CUDA_TRY(cudaMemcpyAsync(&first_row_len, @@ -348,7 +347,7 @@ std::pair, col_map_ptr_type> reader_impl::get_column_na } std::vector first_row(first_row_len); CUDA_TRY(cudaMemcpyAsync(first_row.data(), - data.data(), + d_data.data(), first_row_len * sizeof(char), cudaMemcpyDeviceToHost, stream.value())); @@ -365,7 +364,7 @@ std::pair, col_map_ptr_type> reader_impl::get_column_na // If the first opening bracket is '{', assume object format if (first_curly_bracket < first_square_bracket) { // use keys as column names if input rows are objects - return get_json_object_keys_hashes(parse_opts, rec_starts, data, stream); + return get_json_object_keys_hashes(parse_opts, h_data, rec_starts, d_data, stream); } else { int cols_found = 0; bool quotation = false; @@ -611,38 +610,37 @@ table_with_metadata reader_impl::read(std::vector>& auto range_size = reader_opts.get_byte_range_size(); auto range_size_padded = reader_opts.get_byte_range_size_with_padding(); - auto buffer = ingest_raw_input(sources, range_offset, range_size, range_size_padded); + auto h_data = ingest_raw_input(sources, range_offset, range_size, range_size_padded); - CUDF_EXPECTS(buffer.size() != 0, "Ingest failed: input data is null.\n"); + CUDF_EXPECTS(h_data.size() != 0, "Ingest failed: input data is null.\n"); if (reader_opts.get_compression() != compression_type::NONE) { - buffer = get_uncompressed_data( // - host_span(buffer.data(), buffer.size()), + h_data = get_uncompressed_data( // + host_span(h_data.data(), h_data.size()), reader_opts.get_compression()); } - uncomp_data_ = host_span(static_cast(buffer.data()), buffer.size()); + CUDF_EXPECTS(h_data.data() != nullptr, "Ingest failed: uncompressed input data is null.\n"); + CUDF_EXPECTS(h_data.size() != 0, "Ingest failed: uncompressed input data has zero size.\n"); - CUDF_EXPECTS(uncomp_data_.data() != nullptr, "Ingest failed: uncompressed input data is null.\n"); - CUDF_EXPECTS(uncomp_data_.size() != 0, "Ingest failed: uncompressed input data has zero size.\n"); - - auto data = rmm::device_uvector(0, stream); + auto d_data = rmm::device_uvector(0, stream); if (should_load_whole_source(reader_opts)) { - data = cudf::detail::make_device_uvector_async(uncomp_data_, stream); + d_data = cudf::detail::make_device_uvector_async(h_data, stream); } - auto rec_starts = find_record_starts(reader_opts, data, stream); + auto rec_starts = find_record_starts(reader_opts, h_data, d_data, stream); CUDF_EXPECTS(rec_starts.size() > 0, "Error enumerating records.\n"); if (not should_load_whole_source(reader_opts)) { - data = upload_data_to_device(reader_opts, rec_starts, stream); + d_data = upload_data_to_device(reader_opts, h_data, rec_starts, stream); } - CUDF_EXPECTS(data.size() != 0, "Error uploading input data to the GPU.\n"); + CUDF_EXPECTS(d_data.size() != 0, "Error uploading input data to 
the GPU.\n"); - auto column_names_and_map = get_column_names_and_map(parse_opts.view(), rec_starts, data, stream); + auto column_names_and_map = + get_column_names_and_map(parse_opts.view(), h_data, rec_starts, d_data, stream); auto column_names = std::get<0>(column_names_and_map); auto column_map = std::move(std::get<1>(column_names_and_map)); @@ -650,12 +648,12 @@ table_with_metadata reader_impl::read(std::vector>& CUDF_EXPECTS(not column_names.empty(), "Error determining column names.\n"); auto dtypes = get_data_types( - reader_opts, parse_opts.view(), column_names, column_map.get(), rec_starts, data, stream); + reader_opts, parse_opts.view(), column_names, column_map.get(), rec_starts, d_data, stream); CUDF_EXPECTS(not dtypes.empty(), "Error in data type detection.\n"); return convert_data_to_table( - parse_opts.view(), dtypes, column_names, column_map.get(), rec_starts, data, stream, mr); + parse_opts.view(), dtypes, column_names, column_map.get(), rec_starts, d_data, stream, mr); } table_with_metadata read_json(std::vector>& sources, diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 98fceb78931..aace05403e5 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -50,10 +50,7 @@ using col_map_ptr_type = std::unique_ptr uncomp_data_; - /** * @brief Ingest input JSON file/buffer, without decompression * @@ -76,8 +73,9 @@ class reader_impl { */ std::pair, col_map_ptr_type> get_json_object_keys_hashes( parse_options_view const& parse_opts, + host_span h_data, device_span rec_starts, - device_span data, + device_span d_data, rmm::cuda_stream_view stream); /** @@ -89,7 +87,8 @@ class reader_impl { * @return Record starts in the device memory */ rmm::device_uvector find_record_starts(json_reader_options const& reader_opts, - device_span data, + host_span h_data, + device_span d_data, rmm::cuda_stream_view stream); /** @@ -100,6 +99,7 @@ class reader_impl { * Also updates the array of record starts to match the device data offset. 
*/ rmm::device_uvector upload_data_to_device(json_reader_options const& reader_opts, + host_span h_data, rmm::device_uvector& rec_starts, rmm::cuda_stream_view stream); @@ -113,8 +113,9 @@ class reader_impl { */ std::pair, col_map_ptr_type> get_column_names_and_map( parse_options_view const& parse_opts, + host_span h_data, device_span rec_starts, - device_span data, + device_span d_data, rmm::cuda_stream_view stream); std::vector parse_data_types(std::vector const& column_names, From 8d18ffe4e8bdc7965f6c3bf92efead93ff5b98c9 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 20 Aug 2021 23:03:04 -0500 Subject: [PATCH 24/32] relocate json::reader_impl decompression code --- cpp/src/io/json/reader_impl.cu | 19 ++++++++----------- cpp/src/io/json/reader_impl.hpp | 1 + 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 88432d72f3f..0f2034d15ab 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -201,6 +201,7 @@ std::pair, col_map_ptr_type> reader_impl::get_json_obje std::vector reader_impl::ingest_raw_input( std::vector> const& sources, + compression_type compression, size_t range_offset, size_t range_size, size_t range_size_padded) @@ -223,7 +224,11 @@ std::vector reader_impl::ingest_raw_input( } } - return buffer; + if (compression == compression_type::NONE) { + return buffer; + } else { + return get_uncompressed_data(buffer, compression); + } } bool should_load_whole_source(json_reader_options const& reader_opts) @@ -610,17 +615,9 @@ table_with_metadata reader_impl::read(std::vector>& auto range_size = reader_opts.get_byte_range_size(); auto range_size_padded = reader_opts.get_byte_range_size_with_padding(); - auto h_data = ingest_raw_input(sources, range_offset, range_size, range_size_padded); - - CUDF_EXPECTS(h_data.size() != 0, "Ingest failed: input data is null.\n"); - - if (reader_opts.get_compression() != compression_type::NONE) { - h_data = get_uncompressed_data( // - host_span(h_data.data(), h_data.size()), - reader_opts.get_compression()); - } + auto h_data = ingest_raw_input( + sources, reader_opts.get_compression(), range_offset, range_size, range_size_padded); - CUDF_EXPECTS(h_data.data() != nullptr, "Ingest failed: uncompressed input data is null.\n"); CUDF_EXPECTS(h_data.size() != 0, "Ingest failed: uncompressed input data has zero size.\n"); auto d_data = rmm::device_uvector(0, stream); diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index aace05403e5..22c016c6613 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -62,6 +62,7 @@ class reader_impl { * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data */ std::vector ingest_raw_input(std::vector> const& sources, + compression_type compression, size_t range_offset, size_t range_size, size_t range_size_padded); From 06a39b7f2fdb733d2e83443c94dd40c3d932f610 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 20 Aug 2021 23:13:11 -0500 Subject: [PATCH 25/32] remove unneccessary json::reader_impl class --- cpp/src/io/json/reader_impl.cu | 105 ++++++++++--------- cpp/src/io/json/reader_impl.hpp | 180 -------------------------------- 2 files changed, 53 insertions(+), 232 deletions(-) delete mode 100644 cpp/src/io/json/reader_impl.hpp diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 0f2034d15ab..745bfd40888 100644 --- a/cpp/src/io/json/reader_impl.cu +++ 
b/cpp/src/io/json/reader_impl.cu @@ -19,25 +19,33 @@ * @brief cuDF-IO JSON reader class implementation */ -#include "reader_impl.hpp" +#include "json_common.h" +#include "json_gpu.h" + +#include #include +#include #include +#include #include #include #include #include #include +#include +#include +#include #include #include #include #include #include -#include #include #include +#include #include #include @@ -48,7 +56,12 @@ namespace cudf { namespace io { namespace detail { namespace json { + using namespace cudf::io; +using namespace cudf::io::json; + +using col_map_type = cudf::io::json::gpu::col_map_type; +using col_map_ptr_type = std::unique_ptr>; /** * @brief Aggregate the table containing keys info by their hash values. @@ -183,7 +196,7 @@ auto sort_keys_info_by_offset(std::unique_ptr
info) * * @return Names of JSON object keys in the file */ -std::pair, col_map_ptr_type> reader_impl::get_json_object_keys_hashes( +std::pair, col_map_ptr_type> get_json_object_keys_hashes( parse_options_view const& parse_opts, host_span h_data, device_span rec_starts, @@ -199,12 +212,11 @@ std::pair, col_map_ptr_type> reader_impl::get_json_obje create_col_names_hash_map(sorted_info->get_column(2).view(), stream)}; } -std::vector reader_impl::ingest_raw_input( - std::vector> const& sources, - compression_type compression, - size_t range_offset, - size_t range_size, - size_t range_size_padded) +std::vector ingest_raw_input(std::vector> const& sources, + compression_type compression, + size_t range_offset, + size_t range_size, + size_t range_size_padded) { // Iterate through the user defined sources and read the contents into the local buffer size_t total_source_size = 0; @@ -237,11 +249,10 @@ bool should_load_whole_source(json_reader_options const& reader_opts) reader_opts.get_byte_range_size() == 0; } -rmm::device_uvector reader_impl::find_record_starts( - json_reader_options const& reader_opts, - host_span h_data, - device_span d_data, - rmm::cuda_stream_view stream) +rmm::device_uvector find_record_starts(json_reader_options const& reader_opts, + host_span h_data, + device_span d_data, + rmm::cuda_stream_view stream) { std::vector chars_to_count{'\n'}; // Currently, ignoring lineterminations within quotes is handled by recording the records of both, @@ -293,11 +304,10 @@ rmm::device_uvector reader_impl::find_record_starts( * Only rows that need to be parsed are copied, based on the byte range * Also updates the array of record starts to match the device data offset. */ -rmm::device_uvector reader_impl::upload_data_to_device( - json_reader_options const& reader_opts, - host_span h_data, - rmm::device_uvector& rec_starts, - rmm::cuda_stream_view stream) +rmm::device_uvector upload_data_to_device(json_reader_options const& reader_opts, + host_span h_data, + rmm::device_uvector& rec_starts, + rmm::cuda_stream_view stream) { size_t end_offset = h_data.size(); @@ -333,7 +343,7 @@ rmm::device_uvector reader_impl::upload_data_to_device( stream); } -std::pair, col_map_ptr_type> reader_impl::get_column_names_and_map( +std::pair, col_map_ptr_type> get_column_names_and_map( parse_options_view const& parse_opts, host_span h_data, device_span rec_starts, @@ -389,8 +399,8 @@ std::pair, col_map_ptr_type> reader_impl::get_column_na } } -std::vector reader_impl::parse_data_types( - std::vector const& column_names, std::vector const& types_as_strings) +std::vector parse_data_types(std::vector const& column_names, + std::vector const& types_as_strings) { CUDF_EXPECTS(types_as_strings.size() == column_names.size(), "Need to specify the type of each column.\n"); @@ -431,13 +441,13 @@ std::vector reader_impl::parse_data_types( return dtypes; } -std::vector reader_impl::get_data_types(json_reader_options const& reader_opts, - parse_options_view const& parse_opts, - std::vector const& column_names, - col_map_type* column_map, - device_span rec_starts, - device_span data, - rmm::cuda_stream_view stream) +std::vector get_data_types(json_reader_options const& reader_opts, + parse_options_view const& parse_opts, + std::vector const& column_names, + col_map_type* column_map, + device_span rec_starts, + device_span data, + rmm::cuda_stream_view stream) { bool has_to_infer_column_types = std::visit([](const auto& dtypes) { return dtypes.empty(); }, reader_opts.get_dtypes()); @@ -506,14 +516,14 @@ std::vector 
reader_impl::get_data_types(json_reader_options const& re } } -table_with_metadata reader_impl::convert_data_to_table(parse_options_view const& parse_opts, - std::vector const& dtypes, - std::vector const& column_names, - col_map_type* column_map, - device_span rec_starts, - device_span data, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, + std::vector const& dtypes, + std::vector const& column_names, + col_map_type* column_map, + device_span rec_starts, + device_span data, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { const auto num_columns = dtypes.size(); const auto num_records = rec_starts.size(); @@ -596,11 +606,13 @@ table_with_metadata reader_impl::convert_data_to_table(parse_options_view const& * * @return Table and its metadata */ -table_with_metadata reader_impl::read(std::vector>& sources, - json_reader_options const& reader_opts, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +table_with_metadata read_json(std::vector>& sources, + json_reader_options const& reader_opts, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { + CUDF_EXPECTS(not sources.empty(), "No sources were defined"); + CUDF_EXPECTS(reader_opts.is_enabled_lines(), "Only JSON Lines format is currently supported.\n"); auto parse_opts = parse_options{',', '\n', '\"', '.'}; @@ -653,17 +665,6 @@ table_with_metadata reader_impl::read(std::vector>& parse_opts.view(), dtypes, column_names, column_map.get(), rec_starts, d_data, stream, mr); } -table_with_metadata read_json(std::vector>& sources, - json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(not sources.empty(), "No sources were defined"); - - auto impl = std::make_unique(); - - return table_with_metadata{impl->read(sources, options, stream, mr)}; -} } // namespace json } // namespace detail } // namespace io diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp deleted file mode 100644 index 22c016c6613..00000000000 --- a/cpp/src/io/json/reader_impl.hpp +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file reader_impl.hpp - * @brief cuDF-IO JSON reader class implementation header - */ - -#pragma once - -#include "json_common.h" -#include "json_gpu.h" - -#include - -#include - -#include -#include -#include - -#include -#include -#include - -namespace cudf { -namespace io { -namespace detail { -namespace json { -using namespace cudf::io::json; -using namespace cudf::io; - -using col_map_type = cudf::io::json::gpu::col_map_type; -using col_map_ptr_type = std::unique_ptr>; - -/** - * @brief Class used to parse Json input and convert it into gdf columns. 
- */ -class reader_impl { - private: - /** - * @brief Ingest input JSON file/buffer, without decompression - * - * Sets the source_, byte_range_offset_, and byte_range_size_ data members - * - * @param[in] buffer Buffer to read the bytes in to - * @param[in] range_offset Number of bytes offset from the start - * @param[in] range_size Bytes to read; use `0` for all remaining data - * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data - */ - std::vector ingest_raw_input(std::vector> const& sources, - compression_type compression, - size_t range_offset, - size_t range_size, - size_t range_size_padded); - - /** - * @brief Extract the JSON objects keys from the input file with object rows. - * - * @return Array of keys and a map that maps their hash values to column indices - */ - std::pair, col_map_ptr_type> get_json_object_keys_hashes( - parse_options_view const& parse_opts, - host_span h_data, - device_span rec_starts, - device_span d_data, - rmm::cuda_stream_view stream); - - /** - * @brief Finds all record starts in the file. - * - * Does not upload the entire file to the GPU - * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - * @return Record starts in the device memory - */ - rmm::device_uvector find_record_starts(json_reader_options const& reader_opts, - host_span h_data, - device_span d_data, - rmm::cuda_stream_view stream); - - /** - * @brief Uploads the relevant segment of the input json data onto the GPU. - * - * Sets the d_data_ data member. - * Only rows that need to be parsed are copied, based on the byte range - * Also updates the array of record starts to match the device data offset. - */ - rmm::device_uvector upload_data_to_device(json_reader_options const& reader_opts, - host_span h_data, - rmm::device_uvector& rec_starts, - rmm::cuda_stream_view stream); - - /** - * @brief Parse the first row to set the column name - * - * Sets the column_names_ data member - * - * @param[in] rec_starts Record starts in device memory - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - */ - std::pair, col_map_ptr_type> get_column_names_and_map( - parse_options_view const& parse_opts, - host_span h_data, - device_span rec_starts, - device_span d_data, - rmm::cuda_stream_view stream); - - std::vector parse_data_types(std::vector const& column_names, - std::vector const& types_as_strings); - - /** - * @brief Set the data type array data member - * - * If user does not pass the data types, deduces types from the file content - * - * @param[in] reader_opts Settings for controlling reading behavior - * @param[in] rec_starts Record starts in device memory - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - */ - std::vector get_data_types(json_reader_options const& reader_opts, - parse_options_view const& parse_opts, - std::vector const& column_names, - col_map_type* column_map, - device_span rec_starts, - device_span data, - rmm::cuda_stream_view stream); - - /** - * @brief Parse the input data and store results a table - * - * @param[in] rec_starts Record starts in device memory - * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
- * @param[in] mr Device memory resource to use for device memory allocation - * - * @return Table and its metadata - */ - table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, - std::vector const& dtypes, - std::vector const& column_names, - col_map_type* column_map, - device_span rec_starts, - device_span data, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - - public: - /** - * @brief Read an entire set or a subset of data from the source - * - * @param[in] sources Input `datasource` objects to read the dataset from - * @param[in] options Settings for controlling reading behavior - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - * @param[in] mr Device memory resource to use for device memory allocation - * - * @return Table and its metadata - */ - table_with_metadata read(std::vector>& sources, - json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); -}; - -} // namespace json -} // namespace detail -} // namespace io -} // namespace cudf From c658af184707d976385e02d6362bbdca5a315be2 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 20 Aug 2021 23:14:41 -0500 Subject: [PATCH 26/32] remove deprected file header format --- cpp/src/io/json/reader_impl.cu | 5 ----- 1 file changed, 5 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 745bfd40888..28954b413fa 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -14,11 +14,6 @@ * limitations under the License. */ -/** - * @file reader_impl.cu - * @brief cuDF-IO JSON reader class implementation - */ - #include "json_common.h" #include "json_gpu.h" From a1945afb0bfb247e4bed39ee535893badafb748c Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 20 Aug 2021 23:24:49 -0500 Subject: [PATCH 27/32] remove json_common.h --- cpp/src/io/json/json_common.h | 23 ----------------------- cpp/src/io/json/json_gpu.cu | 5 ++++- cpp/src/io/json/json_gpu.h | 2 +- cpp/src/io/json/reader_impl.cu | 2 +- 4 files changed, 6 insertions(+), 26 deletions(-) delete mode 100644 cpp/src/io/json/json_common.h diff --git a/cpp/src/io/json/json_common.h b/cpp/src/io/json/json_common.h deleted file mode 100644 index 803b937e58d..00000000000 --- a/cpp/src/io/json/json_common.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -using cudf::io::detail::string_index_pair; diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index d3930daefd2..ac47ef552dc 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -14,10 +14,10 @@ * limitations under the License. 
*/ -#include "json_common.h" #include "json_gpu.h" #include +#include #include #include @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -511,6 +512,8 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts, current = desc.value_end + 1; + using string_index_pair = thrust::pair; + // Empty fields are not legal values if (!serialized_trie_contains(opts.trie_na, {desc.value_begin, value_len})) { // Type dispatcher does not handle strings diff --git a/cpp/src/io/json/json_gpu.h b/cpp/src/io/json/json_gpu.h index 7a6bce5e5a5..92024c3e8e6 100644 --- a/cpp/src/io/json/json_gpu.h +++ b/cpp/src/io/json/json_gpu.h @@ -16,8 +16,8 @@ #pragma once +#include #include -#include "json_common.h" #include diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 28954b413fa..a9d55a6f743 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -14,7 +14,6 @@ * limitations under the License. */ -#include "json_common.h" #include "json_gpu.h" #include @@ -35,6 +34,7 @@ #include #include #include +#include #include #include From cf5867fe9c40346df658137fe9542a5a6302f071 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 25 Aug 2021 11:30:49 -0500 Subject: [PATCH 28/32] re-delete json reader_impl.hpp --- cpp/src/io/json/reader_impl.hpp | 205 -------------------------------- 1 file changed, 205 deletions(-) delete mode 100644 cpp/src/io/json/reader_impl.hpp diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp deleted file mode 100644 index 4d14edf360a..00000000000 --- a/cpp/src/io/json/reader_impl.hpp +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file reader_impl.hpp - * @brief cuDF-IO JSON reader class implementation header - */ - -#pragma once - -#include "json_common.h" -#include "json_gpu.h" - -#include - -#include - -#include -#include -#include - -#include -#include -#include - -namespace cudf { -namespace io { -namespace detail { -namespace json { -using namespace cudf::io::json; -using namespace cudf::io; - -using col_map_type = cudf::io::json::gpu::col_map_type; -using col_map_ptr_type = std::unique_ptr>; - -/** - * @brief Class used to parse Json input and convert it into gdf columns. 
- */ -class reader::impl { - public: - private: - const json_reader_options options_{}; - - rmm::mr::device_memory_resource* mr_ = nullptr; - - std::vector> sources_; - std::vector buffer_; - - const char* uncomp_data_ = nullptr; - size_t uncomp_size_ = 0; - - // Used when the input data is compressed, to ensure the allocated uncompressed data is freed - std::vector uncomp_data_owner_; - rmm::device_buffer data_; - - size_t byte_range_offset_ = 0; - size_t byte_range_size_ = 0; - bool load_whole_source_ = true; - - table_metadata metadata_; - std::vector dtypes_; - - // the map is only used for files with rows in object format; initialize to a dummy value so the - // map object can be passed to the kernel in any case - col_map_ptr_type key_to_col_idx_map_; - std::unique_ptr> d_key_col_map_; - - // parsing options - const bool allow_newlines_in_strings_ = false; - parse_options opts_{',', '\n', '\"', '.'}; - - /** - * @brief Sets the column map data member and makes a device copy to be used as a kernel - * parameter. - */ - void set_column_map(col_map_ptr_type&& map, rmm::cuda_stream_view stream) - { - key_to_col_idx_map_ = std::move(map); - d_key_col_map_ = - std::make_unique>(*key_to_col_idx_map_, stream); - } - /** - * @brief Gets the pointer to the column hash map in the device memory. - * - * Returns `nullptr` if the map is not created. - */ - auto get_column_map_device_ptr() - { - return key_to_col_idx_map_ ? d_key_col_map_->data() : nullptr; - } - - /** - * @brief Ingest input JSON file/buffer, without decompression - * - * Sets the source_, byte_range_offset_, and byte_range_size_ data members - * - * @param[in] range_offset Number of bytes offset from the start - * @param[in] range_size Bytes to read; use `0` for all remaining data - * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data - */ - void ingest_raw_input(size_t range_offset, size_t range_size, size_t range_size_padded); - - /** - * @brief Extract the JSON objects keys from the input file with object rows. - * - * @return Array of keys and a map that maps their hash values to column indices - */ - std::pair, col_map_ptr_type> get_json_object_keys_hashes( - device_span rec_starts, rmm::cuda_stream_view stream); - - /** - * @brief Decompress the input data, if needed - * - * Sets the uncomp_data_ and uncomp_size_ data members - */ - void decompress_input(rmm::cuda_stream_view stream); - - /** - * @brief Finds all record starts in the file. - * - * Does not upload the entire file to the GPU - * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - * @return Record starts in the device memory - */ - rmm::device_uvector find_record_starts(rmm::cuda_stream_view stream); - - /** - * @brief Uploads the relevant segment of the input json data onto the GPU. - * - * Sets the d_data_ data member. - * Only rows that need to be parsed are copied, based on the byte range - * Also updates the array of record starts to match the device data offset. - */ - void upload_data_to_device(rmm::device_uvector& rec_starts, - rmm::cuda_stream_view stream); - - /** - * @brief Parse the first row to set the column name - * - * Sets the column_names_ data member - * - * @param[in] rec_starts Record starts in device memory - * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
- */ - void set_column_names(device_span rec_starts, rmm::cuda_stream_view stream); - - /** - * @brief Set the data type array data member - * - * If user does not pass the data types, deduces types from the file content - * - * @param[in] rec_starts Record starts in device memory - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - */ - void set_data_types(device_span rec_starts, rmm::cuda_stream_view stream); - - /** - * @brief Parse the input data and store results a table - * - * @param[in] rec_starts Record starts in device memory - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - * - * @return Table and its metadata - */ - table_with_metadata convert_data_to_table(device_span rec_starts, - rmm::cuda_stream_view stream); - - public: - /** - * @brief Constructor from a dataset source with reader options. - */ - explicit impl(std::vector>&& sources, - json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - - /** - * @brief Read an entire set or a subset of data from the source - * - * @param[in] options Settings for controlling reading behavior - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - * - * @return Table and its metadata - */ - table_with_metadata read(json_reader_options const& options, rmm::cuda_stream_view stream); -}; - -} // namespace json -} // namespace detail -} // namespace io -} // namespace cudf From bd69fbd40d2e490ade7f28376036210c11bf5343 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 25 Aug 2021 11:34:06 -0500 Subject: [PATCH 29/32] fix bad merge where changes in 9079 were deleted. --- cpp/src/io/json/reader_impl.cu | 58 +++++----------------------------- 1 file changed, 8 insertions(+), 50 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index dd3c14ad9ad..c23f1482234 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -26,7 +26,6 @@ #include #include -#include #include #include #include @@ -394,48 +393,6 @@ std::pair, col_map_ptr_type> get_column_names_and_map( } } -std::vector parse_data_types(std::vector const& column_names, - std::vector const& types_as_strings) -{ - CUDF_EXPECTS(types_as_strings.size() == column_names.size(), - "Need to specify the type of each column.\n"); - std::vector dtypes; - // Assume that the dtype is in dictionary format only if all elements contain a colon - const bool is_dict = std::all_of( - std::cbegin(types_as_strings), std::cend(types_as_strings), [](const std::string& s) { - return std::find(std::cbegin(s), std::cend(s), ':') != std::cend(s); - }); - - auto split_on_colon = [](std::string_view s) { - auto const i = s.find(":"); - return std::pair{s.substr(0, i), s.substr(i + 1)}; - }; - - if (is_dict) { - std::map col_type_map; - std::transform( - std::cbegin(types_as_strings), - std::cend(types_as_strings), - std::inserter(col_type_map, col_type_map.end()), - [&](auto const& ts) { - auto const [col_name, type_str] = split_on_colon(ts); - return std::pair{std::string{col_name}, convert_string_to_dtype(std::string{type_str})}; - }); - - // Using the map here allows O(n log n) complexity - std::transform(std::cbegin(column_names), - std::cend(column_names), - std::back_inserter(dtypes), - [&](auto const& column_name) { return col_type_map[column_name]; }); - } else { - std::transform(std::cbegin(types_as_strings), - std::cend(types_as_strings), - std::back_inserter(dtypes), 
- [](auto const& col_dtype) { return convert_string_to_dtype(col_dtype); }); - } - return dtypes; -} - std::vector get_data_types(json_reader_options const& reader_opts, parse_options_view const& parse_opts, std::vector const& column_names, @@ -449,11 +406,15 @@ std::vector get_data_types(json_reader_options const& reader_opts, if (!has_to_infer_column_types) { return std::visit(cudf::detail::visitor_overload{ - [&](const std::vector& dtypes) { return dtypes; }, + [&](const std::vector& dtypes) { + CUDF_EXPECTS(dtypes.size() == metadata_.column_names.size(), + "Must specify types for all columns"); + return dtypes; + }, [&](const std::map& dtypes) { std::vector sorted_dtypes; - std::transform(std::cbegin(column_names), - std::cend(column_names), + std::transform(std::cbegin(metadata_.column_names), + std::cend(metadata_.column_names), std::back_inserter(sorted_dtypes), [&](auto const& column_name) { auto const it = dtypes.find(column_name); @@ -462,11 +423,8 @@ std::vector get_data_types(json_reader_options const& reader_opts, return it->second; }); return sorted_dtypes; - }, - [&](std::vector const& dtypes) { - return parse_data_types(column_names, dtypes); }}, - reader_opts.get_dtypes()); + options_.get_dtypes()); } else { CUDF_EXPECTS(rec_starts.size() != 0, "No data available for data type inference.\n"); auto const num_columns = column_names.size(); From 33d3cb7f7b183392cd43516d1e703ee76a78b8e3 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 25 Aug 2021 22:00:52 -0500 Subject: [PATCH 30/32] read_json: fix missing visitor_overload include --- cpp/src/io/json/reader_impl.cu | 40 +++++++++++++++++----------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index c23f1482234..3f11c4ed7f2 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -26,6 +26,7 @@ #include #include +#include #include #include #include @@ -405,26 +406,25 @@ std::vector get_data_types(json_reader_options const& reader_opts, std::visit([](const auto& dtypes) { return dtypes.empty(); }, reader_opts.get_dtypes()); if (!has_to_infer_column_types) { - return std::visit(cudf::detail::visitor_overload{ - [&](const std::vector& dtypes) { - CUDF_EXPECTS(dtypes.size() == metadata_.column_names.size(), - "Must specify types for all columns"); - return dtypes; - }, - [&](const std::map& dtypes) { - std::vector sorted_dtypes; - std::transform(std::cbegin(metadata_.column_names), - std::cend(metadata_.column_names), - std::back_inserter(sorted_dtypes), - [&](auto const& column_name) { - auto const it = dtypes.find(column_name); - CUDF_EXPECTS(it != dtypes.end(), - "Must specify types for all columns"); - return it->second; - }); - return sorted_dtypes; - }}, - options_.get_dtypes()); + return std::visit( + cudf::detail::visitor_overload{ + [&](const std::vector& dtypes) { + CUDF_EXPECTS(dtypes.size() == column_names.size(), "Must specify types for all columns"); + return dtypes; + }, + [&](const std::map& dtypes) { + std::vector sorted_dtypes; + std::transform(std::cbegin(column_names), + std::cend(column_names), + std::back_inserter(sorted_dtypes), + [&](auto const& column_name) { + auto const it = dtypes.find(column_name); + CUDF_EXPECTS(it != dtypes.end(), "Must specify types for all columns"); + return it->second; + }); + return sorted_dtypes; + }}, + reader_opts.get_dtypes()); } else { CUDF_EXPECTS(rec_starts.size() != 0, "No data available for data type inference.\n"); auto const num_columns = 
column_names.size(); From 5e0ff9ae38c27d9db567ef46659a0b590481574e Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 11 Nov 2021 10:09:29 -0600 Subject: [PATCH 31/32] remove unnecessary doc comments and div-by-1 --- cpp/include/cudf/io/detail/json.hpp | 15 +++++---------- cpp/src/io/json/reader_impl.cu | 2 +- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 7ab8906e5a9..ca490b2619e 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -12,12 +12,7 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - */ - -/** - * @file json.hpp - * @brief cuDF-IO reader classes API - */ + */\ #pragma once @@ -33,10 +28,10 @@ namespace json { /** * @brief Reads and returns the entire data set. * - * @param[in] sources Input `datasource` objects to read the dataset from - * @param[in] options Settings for controlling reading behavior - * @param[in] stream CUDA stream used for device memory operations and kernel launches - * @param[in] mr Device memory resource to use for device memory allocation + * @param sources Input `datasource` objects to read the dataset from + * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation * * @return cudf::table object that contains the array of cudf::column. */ diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 0d819930ac9..319906111af 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -345,7 +345,7 @@ std::pair, col_map_ptr_type> get_column_names_and_map( rmm::cuda_stream_view stream) { // If file only contains one row, use the file size for the row size - uint64_t first_row_len = d_data.size() / sizeof(char); + uint64_t first_row_len = d_data.size(); if (rec_starts.size() > 1) { // Set first_row_len to the offset of the second row, if it exists CUDA_TRY(cudaMemcpyAsync(&first_row_len, From cf428385e9393c0c6b21a75238a02e7657f97ed6 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 11 Nov 2021 10:28:41 -0600 Subject: [PATCH 32/32] fix formatting issue --- cpp/include/cudf/io/detail/json.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index ca490b2619e..69b26a7b70a 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -12,7 +12,7 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - */\ + */ #pragma once
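
The dtype-dispatch code touched in PATCH 29 and PATCH 30 selects column types with std::visit over the variant returned by json_reader_options::get_dtypes(), using cudf::detail::visitor_overload to bundle one lambda per variant alternative (hence the missing-include fix in PATCH 30). A minimal, self-contained C++17 sketch of that overloaded-lambda visitor pattern follows; the local `overload` helper, the string-based dtype stand-ins, and the variant type are illustrative assumptions only, not cudf APIs, and the snippet is not part of any patch in this series.

#include <iostream>
#include <map>
#include <string>
#include <variant>
#include <vector>

// Local stand-in for cudf::detail::visitor_overload: inherits each lambda's
// operator(), so std::visit can dispatch on the active variant alternative.
template <typename... Fs>
struct overload : Fs... {
  using Fs::operator()...;
};
template <typename... Fs>
overload(Fs...) -> overload<Fs...>;  // C++17 deduction guide

int main()
{
  // Simplified shape of json_reader_options::get_dtypes(): either a flat list
  // of types or a per-column map (types shown as strings instead of data_type).
  using dtypes_arg = std::variant<std::vector<std::string>, std::map<std::string, std::string>>;

  std::vector<std::string> const column_names{"a", "b"};
  dtypes_arg const arg = std::map<std::string, std::string>{{"a", "int64"}, {"b", "float64"}};

  auto const sorted_dtypes = std::visit(
    overload{
      // Flat list: assumed to already be in column order.
      [&](std::vector<std::string> const& dtypes) { return dtypes; },
      // Map: reorder the types to match column_names, as the patched code does.
      [&](std::map<std::string, std::string> const& dtypes) {
        std::vector<std::string> out;
        for (auto const& name : column_names) { out.push_back(dtypes.at(name)); }
        return out;
      }},
    arg);

  for (auto const& t : sorted_dtypes) { std::cout << t << '\n'; }  // prints: int64, float64
  return 0;
}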