From 1dfd27d4c0e2e5912d19c6a1384ae5cd73d3531a Mon Sep 17 00:00:00 2001 From: sft-managed Date: Thu, 17 Sep 2020 02:54:49 +0000 Subject: [PATCH 01/52] add owning_buffer to datasource; add device_read path to parquet reader --- cpp/include/cudf/io/datasource.hpp | 32 ++++++++++++++++++++++++++++++ cpp/src/io/parquet/parquet_gpu.h | 2 +- cpp/src/io/parquet/reader_impl.cu | 21 +++++++++++--------- cpp/src/io/parquet/reader_impl.hpp | 2 +- 4 files changed, 46 insertions(+), 11 deletions(-) diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index a99bac3f7f1..cd996a80653 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -56,6 +56,9 @@ class datasource { * @brief Base class destructor */ virtual ~buffer() {} + + template + static std::unique_ptr create(Container&& data_owner); }; /** @@ -211,8 +214,37 @@ class datasource { uint8_t* const _data; size_t const _size; }; + + template + class owning_buffer : public buffer { + public: + owning_buffer(Container&& data_owner) + : _owner(std::move(data_owner)), _data_ptr(_owner.data()), _size(_owner.size()) + { + } + // to create a view into an existing owning buffer + owning_buffer(Container&& data_owner, const uint8_t* data_ptr, size_t size) + : _owner(std::move(data_owner)), _data_ptr(data_ptr), _size(size) + { + } + + size_t size() const override { return _size; } + + const uint8_t* data() const override { return static_cast(_data_ptr); } + + private: + Container _data; + void const* const _data_ptr; + size_t const _size; + }; }; +template +std::unique_ptr datasource::buffer::create(Container&& data_owner) +{ + return std::make_unique>(std::move(data_owner)); +} + /** * @brief Implementation class for reading from an Apache Arrow file. The file * could be a memory-mapped file or other implementation supported by Arrow. diff --git a/cpp/src/io/parquet/parquet_gpu.h b/cpp/src/io/parquet/parquet_gpu.h index f42bbd0da57..756e71114f2 100644 --- a/cpp/src/io/parquet/parquet_gpu.h +++ b/cpp/src/io/parquet/parquet_gpu.h @@ -159,7 +159,7 @@ struct ColumnChunkDesc { { } - uint8_t *compressed_data; // pointer to compressed column chunk data + uint8_t const *compressed_data; // pointer to compressed column chunk data size_t compressed_size; // total compressed data size for this chunk size_t num_values; // total number of values in this column size_t start_row; // starting row of this chunk diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index 8feb688946a..66c4af3dab7 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -531,7 +531,7 @@ class aggregate_metadata { * @copydoc cudf::io::detail::parquet::read_column_chunks */ void reader::impl::read_column_chunks( - std::vector &page_data, + std::vector> &page_data, hostdevice_vector &chunks, // TODO const? size_t begin_chunk, size_t end_chunk, @@ -559,9 +559,15 @@ void reader::impl::read_column_chunks( next_chunk++; } if (io_size != 0) { - auto buffer = _sources[chunk_source_map[chunk]]->host_read(io_offset, io_size); - page_data[chunk] = rmm::device_buffer(buffer->data(), buffer->size(), stream); - uint8_t *d_compdata = reinterpret_cast(page_data[chunk].data()); + auto &source = _sources[chunk_source_map[chunk]]; + if (!source->supports_device_read()) { + auto const buffer = source->host_read(io_offset, io_size); + page_data[chunk] = + datasource::buffer::create(rmm::device_buffer(buffer->data(), buffer->size(), stream)); + } else { + page_data[chunk] = source->device_read(io_offset, io_size); + } + uint8_t const *d_compdata = page_data[chunk]->data(); do { chunks[chunk].compressed_data = d_compdata; d_compdata += chunks[chunk].compressed_size; @@ -1042,7 +1048,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, std::vector chunk_source_map(num_chunks); // Tracker for eventually deallocating compressed and uncompressed data - std::vector page_data(num_chunks); + std::vector> page_data(num_chunks); // Keep track of column chunk file offsets std::vector column_chunk_offsets(num_chunks); @@ -1147,10 +1153,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, decomp_page_data = decompress_page_data(chunks, pages, stream); // Free compressed data for (size_t c = 0; c < chunks.size(); c++) { - if (chunks[c].codec != parquet::Compression::UNCOMPRESSED && page_data[c].size() != 0) { - page_data[c].resize(0); - page_data[c].shrink_to_fit(); - } + if (chunks[c].codec != parquet::Compression::UNCOMPRESSED) { page_data[c].reset(); } } } diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 100b50062c9..cb1ccfa2098 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -90,7 +90,7 @@ class reader::impl { * @param stream CUDA stream used for device memory operations and kernel launches. * */ - void read_column_chunks(std::vector &page_data, + void read_column_chunks(std::vector> &page_data, hostdevice_vector &chunks, size_t begin_chunk, size_t end_chunk, From 5cfc08f885f3a5f54d821e039025a3c6564500b9 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Fri, 18 Sep 2020 06:10:55 +0000 Subject: [PATCH 02/52] name fix --- cpp/include/cudf/io/datasource.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index cd996a80653..8bc04a11510 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -219,12 +219,12 @@ class datasource { class owning_buffer : public buffer { public: owning_buffer(Container&& data_owner) - : _owner(std::move(data_owner)), _data_ptr(_owner.data()), _size(_owner.size()) + : _data(std::move(data_owner)), _data_ptr(_data.data()), _size(_data.size()) { } // to create a view into an existing owning buffer owning_buffer(Container&& data_owner, const uint8_t* data_ptr, size_t size) - : _owner(std::move(data_owner)), _data_ptr(data_ptr), _size(size) + : _data(std::move(data_owner)), _data_ptr(data_ptr), _size(size) { } From eb0a0757410c2f6a6919b6023b575c6a4bfb07b3 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Fri, 18 Sep 2020 06:56:29 +0000 Subject: [PATCH 03/52] CMake change to find and link to cufile lib --- cpp/CMakeLists.txt | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index bfd3abd1610..371d83f4a64 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -280,6 +280,27 @@ else() message(FATAL_ERROR "Boost not found, please check your settings.") endif(Boost_FOUND) +################################################################################################### +# - find cufile ----------------------------------------------------------------------------------- + +find_path(CUFILE_INCLUDE "cufile.h" + HINTS "${GDS_ROOT}/lib" + "/usr/local/gds/lib") + +find_library(CUFILE_LIBRARY "libcufile.so" + HINTS "${GDS_ROOT}/lib" + "/usr/local/gds/lib") + +message(STATUS "CUFILE: CUFILE_LIBRARY set to ${CUFILE_LIBRARY}") +message(STATUS "CUFILE: CUFILE_INCLUDE set to ${CUFILE_INCLUDE}") + +add_library(cufile SHARED IMPORTED ${CUFILE_LIBRARY}) +if (CUFILE_INCLUDE AND CUFILE_LIBRARY) + set_target_properties(cufile PROPERTIES IMPORTED_LOCATION ${CUFILE_LIBRARY}) +else() + message(FATAL_ERROR "cufile not found, please pass the GDS install directory using -DGDS_ROOT") +endif (CUFILE_INCLUDE AND CUFILE_LIBRARY) + ################################################################################################### # - RMM ------------------------------------------------------------------------------------------- @@ -373,7 +394,8 @@ include_directories("${CMAKE_BINARY_DIR}/include" "${ZLIB_INCLUDE_DIRS}" "${Boost_INCLUDE_DIRS}" "${RMM_INCLUDE}" - "${DLPACK_INCLUDE}") + "${DLPACK_INCLUDE}" + "${CUFILE_INCLUDE}") if(CONDA_INCLUDE_DIRS) include_directories("${CONDA_INCLUDE_DIRS}") @@ -385,7 +407,8 @@ endif(CONDA_INCLUDE_DIRS) link_directories("${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}" # CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES is an undocumented/unsupported variable containing the link directories for nvcc "${CMAKE_BINARY_DIR}/lib" "${CMAKE_BINARY_DIR}" - "${GTEST_LIBRARY_DIR}") + "${GTEST_LIBRARY_DIR}" + "${CUFILE_LIBRARY}") if(CONDA_LINK_DIRS) link_directories("${CONDA_LINK_DIRS}") From 3d00911e8a7380ea441cdfb99fc8025dd972d3fd Mon Sep 17 00:00:00 2001 From: sft-managed Date: Tue, 22 Sep 2020 16:20:31 +0000 Subject: [PATCH 04/52] link to cufile --- cpp/CMakeLists.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index dba09897e89..0154d2d7732 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -407,8 +407,7 @@ endif(CONDA_INCLUDE_DIRS) link_directories("${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}" # CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES is an undocumented/unsupported variable containing the link directories for nvcc "${CMAKE_BINARY_DIR}/lib" "${CMAKE_BINARY_DIR}" - "${GTEST_LIBRARY_DIR}" - "${CUFILE_LIBRARY}") + "${GTEST_LIBRARY_DIR}") if(CONDA_LINK_DIRS) link_directories("${CONDA_LINK_DIRS}") @@ -809,7 +808,7 @@ target_compile_definitions(cudf PUBLIC SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${LOGGIN # - link libraries -------------------------------------------------------------------------------- # link targets for cuDF -target_link_libraries(cudf arrow arrow_cuda nvrtc ${CUDART_LIBRARY} cuda ${ZLIB_LIBRARIES} ${Boost_LIBRARIES}) +target_link_libraries(cudf arrow arrow_cuda nvrtc ${CUDART_LIBRARY} "${CUFILE_LIBRARY}" cuda ${ZLIB_LIBRARIES} ${Boost_LIBRARIES}) ################################################################################################### # - install targets ------------------------------------------------------------------------------- From c834cdcb8e94f74aaa1fecf9cf897c53ca35c79e Mon Sep 17 00:00:00 2001 From: sft-managed Date: Tue, 22 Sep 2020 17:24:08 +0000 Subject: [PATCH 05/52] basic device_read --- cpp/src/io/utilities/datasource.cpp | 79 ++++++++++++++++++++++++++--- 1 file changed, 73 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 4e1ae7f854b..e4c0c3665e2 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -20,12 +20,69 @@ #include #include +#include + +#include + #include #include namespace cudf { namespace io { +struct file_wrapper { + int const fd = -1; + explicit file_wrapper(const char *filepath, int oflags = O_RDONLY) : fd(open(filepath, oflags)) {} + ~file_wrapper() { close(fd); } +}; + +struct cufile_driver { + cufile_driver() + { + if (cuFileDriverOpen().err != CU_FILE_SUCCESS) throw "Cannot init cufile driver"; + } + ~cufile_driver() { cuFileDriverClose(); } +}; + +class gdsfile { + public: + gdsfile(const char *filepath) : handle(filepath, O_RDONLY | O_DIRECT) + { + static cufile_driver driver; + CUDF_EXPECTS(handle.fd != -1, "Cannot open file"); + + CUfileDescr_t cf_desc{}; + cf_desc.handle.fd = handle.fd; + cf_desc.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD; + CUDF_EXPECTS(cuFileHandleRegister(&cf_handle, &cf_desc).err == CU_FILE_SUCCESS, + "Cannot map cufile"); + + struct stat st; + CUDF_EXPECTS(fstat(handle.fd, &st) != -1, "Cannot query file size"); + } + + std::unique_ptr read(size_t offset, size_t size) + { + rmm::device_buffer out_data(size); + cuFileRead(cf_handle, out_data.data(), size, offset, 0); + + return datasource::buffer::create(std::move(out_data)); + } + + size_t read(size_t offset, size_t size, uint8_t *dst) + { + cuFileRead(cf_handle, dst, size, offset, 0); + // have to read the requested size for now + return size; + } + + ~gdsfile() { cuFileHandleDeregister(cf_handle); } + + private: + file_wrapper handle; + CUfileHandle_t cf_handle = nullptr; +}; + /** * @brief Implementation class for reading from a file or memory source using * memory mapped access. @@ -34,12 +91,6 @@ namespace io { * mapping a subset of the file where the starting offset may not be zero. */ class memory_mapped_source : public datasource { - struct file_wrapper { - const int fd = -1; - explicit file_wrapper(const char *filepath) : fd(open(filepath, O_RDONLY)) {} - ~file_wrapper() { close(fd); } - }; - class memory_mapped_buffer : public buffer { size_t _size = 0; uint8_t *_data = nullptr; @@ -52,6 +103,7 @@ class memory_mapped_source : public datasource { public: explicit memory_mapped_source(const char *filepath, size_t offset, size_t size) + : _gds_file(filepath) { auto const file = file_wrapper(filepath); CUDF_EXPECTS(file.fd != -1, "Cannot open file"); @@ -91,6 +143,20 @@ class memory_mapped_source : public datasource { return read_size; } + bool supports_device_read() const override { return true; } + + std::unique_ptr device_read(size_t offset, size_t size) override + { + auto const read_size = std::min(size, map_size_ - (offset - map_offset_)); + return _gds_file.read(offset, size); + } + + size_t device_read(size_t offset, size_t size, uint8_t *dst) override + { + auto const read_size = std::min(size, map_size_ - (offset - map_offset_)); + return _gds_file.read(offset, size, dst); + } + size_t size() const override { return file_size_; } private: @@ -123,6 +189,7 @@ class memory_mapped_source : public datasource { void *map_addr_ = nullptr; size_t map_size_ = 0; size_t map_offset_ = 0; + gdsfile _gds_file; }; /** From 585d3a2b62a60e0d49109113937bd2d67110cc3d Mon Sep 17 00:00:00 2001 From: sft-managed Date: Tue, 22 Sep 2020 18:53:40 +0000 Subject: [PATCH 06/52] extract gds file into a separate source file --- cpp/CMakeLists.txt | 1 + cpp/src/io/utilities/data_sink.cpp | 12 ++++- cpp/src/io/utilities/file_utils.cpp | 74 +++++++++++++++++++++++++++++ cpp/src/io/utilities/file_utils.hpp | 61 ++++++++++++++++++++++++ 4 files changed, 146 insertions(+), 2 deletions(-) create mode 100644 cpp/src/io/utilities/file_utils.cpp create mode 100644 cpp/src/io/utilities/file_utils.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index fe2b9337bea..c5dfc979748 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -511,6 +511,7 @@ add_library(cudf src/io/functions.cpp src/io/statistics/column_stats.cu src/io/utilities/datasource.cpp + src/io/utilities/file_utils.cpp src/io/utilities/parsing_utils.cu src/io/utilities/type_conversion.cu src/io/utilities/data_sink.cpp diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 0a2a397f213..2ddfd2a8a6c 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -27,7 +27,7 @@ namespace io { */ class file_sink : public data_sink { public: - explicit file_sink(std::string const& filepath) + explicit file_sink(std::string const& filepath) : bytes_written_(0) { outfile_.open(filepath, std::ios::out | std::ios::binary | std::ios::trunc); CUDF_EXPECTS(outfile_.is_open(), "Cannot open output file"); @@ -42,10 +42,18 @@ class file_sink : public data_sink { void flush() override { outfile_.flush(); } - size_t bytes_written() override { return outfile_.tellp(); } + size_t bytes_written() override { return bytes_written_; } + + // bool supports_device_write() const override { return true; } + + // void device_write(void const* gpu_data, size_t size, cudaStream_t stream) override + //{ + // bytes_written_ += size; + //} private: std::ofstream outfile_; + size_t bytes_written_; }; /** diff --git a/cpp/src/io/utilities/file_utils.cpp b/cpp/src/io/utilities/file_utils.cpp new file mode 100644 index 00000000000..cddde54c7a1 --- /dev/null +++ b/cpp/src/io/utilities/file_utils.cpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +#include +#include +#include +#include + +#include + +namespace cudf { +namespace io { + +file_wrapper::file_wrapper(const char *filepath, int oflags) : fd(open(filepath, oflags)) +{ + CUDF_EXPECTS(fd != -1, "Cannot open file"); +} + +file_wrapper::~file_wrapper() { close(fd); } + +size_t file_wrapper::size() const +{ + struct stat st; + CUDF_EXPECTS(fstat(fd, &st) != -1, "Cannot query file size"); + return static_cast(st.st_size); +} + +gdsfile::gdsfile(const char *filepath) : file(filepath, O_RDONLY | O_DIRECT) +{ + static cufile_driver driver; + CUDF_EXPECTS(file.get_desc() != -1, "Cannot open file"); + + CUfileDescr_t cufile_desc{}; + cufile_desc.handle.fd = file.get_desc(); + cufile_desc.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD; + CUDF_EXPECTS(cuFileHandleRegister(&cufile_handle, &cufile_desc).err == CU_FILE_SUCCESS, + "Cannot map cufile"); + + struct stat st; + CUDF_EXPECTS(fstat(file.get_desc(), &st) != -1, "Cannot query file size"); +} + +std::unique_ptr gdsfile::read(size_t offset, size_t size) +{ + rmm::device_buffer out_data(size); + cuFileRead(cufile_handle, out_data.data(), size, offset, 0); + + return datasource::buffer::create(std::move(out_data)); +} + +size_t gdsfile::read(size_t offset, size_t size, uint8_t *dst) +{ + cuFileRead(cufile_handle, dst, size, offset, 0); + // have to read the requested size for now + return size; +} + +gdsfile::~gdsfile() { cuFileHandleDeregister(cufile_handle); } +}; // namespace io +}; // namespace cudf \ No newline at end of file diff --git a/cpp/src/io/utilities/file_utils.hpp b/cpp/src/io/utilities/file_utils.hpp new file mode 100644 index 00000000000..24f79751ca4 --- /dev/null +++ b/cpp/src/io/utilities/file_utils.hpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +namespace cudf { +namespace io { + +class file_wrapper { + int const fd = -1; + + public: + explicit file_wrapper(const char *filepath, int oflags); + ~file_wrapper(); + size_t size() const; + auto get_desc() const { return fd; } +}; + +struct cufile_driver { + cufile_driver() + { + if (cuFileDriverOpen().err != CU_FILE_SUCCESS) CUDF_FAIL("Cannot init cufile driver"); + } + ~cufile_driver() { cuFileDriverClose(); } +}; + +class gdsfile { + public: + gdsfile(const char *filepath); + + std::unique_ptr read(size_t offset, size_t size); + + size_t read(size_t offset, size_t size, uint8_t *dst); + + ~gdsfile(); + + private: + file_wrapper const file; + CUfileHandle_t cufile_handle = nullptr; +}; + +}; // namespace io +}; // namespace cudf \ No newline at end of file From be84443b18efae47399d6776f2ccf5ebe7f87eae Mon Sep 17 00:00:00 2001 From: sft-managed Date: Tue, 22 Sep 2020 19:13:02 +0000 Subject: [PATCH 07/52] missing file from previous commit --- cpp/src/io/utilities/datasource.cpp | 84 ++++------------------------- 1 file changed, 10 insertions(+), 74 deletions(-) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index e4c0c3665e2..47ceb0d8862 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,75 +14,18 @@ * limitations under the License. */ +#include + #include #include -#include -#include #include -#include - -#include - -#include #include +#include namespace cudf { namespace io { -struct file_wrapper { - int const fd = -1; - explicit file_wrapper(const char *filepath, int oflags = O_RDONLY) : fd(open(filepath, oflags)) {} - ~file_wrapper() { close(fd); } -}; - -struct cufile_driver { - cufile_driver() - { - if (cuFileDriverOpen().err != CU_FILE_SUCCESS) throw "Cannot init cufile driver"; - } - ~cufile_driver() { cuFileDriverClose(); } -}; - -class gdsfile { - public: - gdsfile(const char *filepath) : handle(filepath, O_RDONLY | O_DIRECT) - { - static cufile_driver driver; - CUDF_EXPECTS(handle.fd != -1, "Cannot open file"); - - CUfileDescr_t cf_desc{}; - cf_desc.handle.fd = handle.fd; - cf_desc.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD; - CUDF_EXPECTS(cuFileHandleRegister(&cf_handle, &cf_desc).err == CU_FILE_SUCCESS, - "Cannot map cufile"); - - struct stat st; - CUDF_EXPECTS(fstat(handle.fd, &st) != -1, "Cannot query file size"); - } - - std::unique_ptr read(size_t offset, size_t size) - { - rmm::device_buffer out_data(size); - cuFileRead(cf_handle, out_data.data(), size, offset, 0); - - return datasource::buffer::create(std::move(out_data)); - } - - size_t read(size_t offset, size_t size, uint8_t *dst) - { - cuFileRead(cf_handle, dst, size, offset, 0); - // have to read the requested size for now - return size; - } - - ~gdsfile() { cuFileHandleDeregister(cf_handle); } - - private: - file_wrapper handle; - CUfileHandle_t cf_handle = nullptr; -}; - /** * @brief Implementation class for reading from a file or memory source using * memory mapped access. @@ -105,14 +48,9 @@ class memory_mapped_source : public datasource { explicit memory_mapped_source(const char *filepath, size_t offset, size_t size) : _gds_file(filepath) { - auto const file = file_wrapper(filepath); - CUDF_EXPECTS(file.fd != -1, "Cannot open file"); - - struct stat st; - CUDF_EXPECTS(fstat(file.fd, &st) != -1, "Cannot query file size"); - file_size_ = static_cast(st.st_size); - - if (file_size_ != 0) { map(file.fd, offset, size); } + auto const file = file_wrapper(filepath, O_RDONLY); + file_size_ = file.size(); + if (file_size_ != 0) { map(file.get_desc(), offset, size); } } virtual ~memory_mapped_source() @@ -165,7 +103,7 @@ class memory_mapped_source : public datasource { CUDF_EXPECTS(offset < file_size_, "Offset is past end of file"); // Offset for `mmap()` must be page aligned - auto const map_offset = offset & ~(sysconf(_SC_PAGESIZE) - 1); + map_offset_ = offset & ~(sysconf(_SC_PAGESIZE) - 1); // Clamp length to available data in the file if (size == 0) { @@ -175,13 +113,11 @@ class memory_mapped_source : public datasource { } // Size for `mmap()` needs to include the page padding - const auto map_size = size + (offset - map_offset); + map_size_ = size + (offset - map_offset_); // Check if accessing a region within already mapped area - map_addr_ = mmap(NULL, map_size, PROT_READ, MAP_PRIVATE, fd, map_offset); + map_addr_ = mmap(nullptr, map_size_, PROT_READ, MAP_PRIVATE, fd, map_offset_); CUDF_EXPECTS(map_addr_ != MAP_FAILED, "Cannot create memory mapping"); - map_offset_ = map_offset; - map_size_ = map_size; } private: From 62135c6b7997dd3035e83308cac19f410a3da2ee Mon Sep 17 00:00:00 2001 From: sft-managed Date: Tue, 22 Sep 2020 22:09:07 +0000 Subject: [PATCH 08/52] gds sink --- cpp/src/io/utilities/data_sink.cpp | 19 +++++--- cpp/src/io/utilities/datasource.cpp | 4 +- cpp/src/io/utilities/file_utils.cpp | 75 ++++++++++++++++++++++------- cpp/src/io/utilities/file_utils.hpp | 39 +++++++++------ 4 files changed, 95 insertions(+), 42 deletions(-) diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 2ddfd2a8a6c..a65f5b2b866 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -18,6 +18,7 @@ #include #include +#include namespace cudf { namespace io { @@ -27,7 +28,7 @@ namespace io { */ class file_sink : public data_sink { public: - explicit file_sink(std::string const& filepath) : bytes_written_(0) + explicit file_sink(std::string const& filepath) : _gds_file(filepath) { outfile_.open(filepath, std::ios::out | std::ios::binary | std::ios::trunc); CUDF_EXPECTS(outfile_.is_open(), "Cannot open output file"); @@ -37,23 +38,27 @@ class file_sink : public data_sink { void host_write(void const* data, size_t size) override { + outfile_.seekp(bytes_written_); outfile_.write(reinterpret_cast(data), size); + bytes_written_ += size; } void flush() override { outfile_.flush(); } size_t bytes_written() override { return bytes_written_; } - // bool supports_device_write() const override { return true; } + bool supports_device_write() const override { return true; } - // void device_write(void const* gpu_data, size_t size, cudaStream_t stream) override - //{ - // bytes_written_ += size; - //} + void device_write(void const* gpu_data, size_t size, cudaStream_t stream) override + { + _gds_file.write(gpu_data, bytes_written_, size); + bytes_written_ += size; + } private: std::ofstream outfile_; - size_t bytes_written_; + size_t bytes_written_ = 0; + gdsoutfile _gds_file; }; /** diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 47ceb0d8862..bb66ae852d5 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -50,7 +50,7 @@ class memory_mapped_source : public datasource { { auto const file = file_wrapper(filepath, O_RDONLY); file_size_ = file.size(); - if (file_size_ != 0) { map(file.get_desc(), offset, size); } + if (file_size_ != 0) { map(file.desc(), offset, size); } } virtual ~memory_mapped_source() @@ -125,7 +125,7 @@ class memory_mapped_source : public datasource { void *map_addr_ = nullptr; size_t map_size_ = 0; size_t map_offset_ = 0; - gdsfile _gds_file; + gdsinfile _gds_file; }; /** diff --git a/cpp/src/io/utilities/file_utils.cpp b/cpp/src/io/utilities/file_utils.cpp index cddde54c7a1..12deb910019 100644 --- a/cpp/src/io/utilities/file_utils.cpp +++ b/cpp/src/io/utilities/file_utils.cpp @@ -25,50 +25,89 @@ namespace cudf { namespace io { -file_wrapper::file_wrapper(const char *filepath, int oflags) : fd(open(filepath, oflags)) +file_wrapper::file_wrapper(std::string const &filepath, int flags) + : fd(open(filepath.c_str(), flags)) { CUDF_EXPECTS(fd != -1, "Cannot open file"); } +file_wrapper::file_wrapper(std::string const &filepath, int flags, mode_t mode) + : fd(open(filepath.c_str(), flags, mode)) +{ + CUDF_EXPECTS(fd != -1, "Cannot open file"); +} + +struct cufile_driver { + cufile_driver() + { + if (cuFileDriverOpen().err != CU_FILE_SUCCESS) CUDF_FAIL("Cannot init cufile driver"); + } + ~cufile_driver() { cuFileDriverClose(); } +}; + +void init_cufile_driver() { static cufile_driver driver; } + file_wrapper::~file_wrapper() { close(fd); } -size_t file_wrapper::size() const +long file_wrapper::size() const { - struct stat st; - CUDF_EXPECTS(fstat(fd, &st) != -1, "Cannot query file size"); - return static_cast(st.st_size); + if (_size < 0) { + struct stat st; + CUDF_EXPECTS(fstat(fd, &st) != -1, "Cannot query file size"); + _size = static_cast(st.st_size); + } + return _size; } -gdsfile::gdsfile(const char *filepath) : file(filepath, O_RDONLY | O_DIRECT) +gdsinfile::gdsinfile(std::string const &filepath) : file(filepath, O_RDONLY | O_DIRECT) { - static cufile_driver driver; - CUDF_EXPECTS(file.get_desc() != -1, "Cannot open file"); + init_cufile_driver(); CUfileDescr_t cufile_desc{}; - cufile_desc.handle.fd = file.get_desc(); + cufile_desc.handle.fd = file.desc(); cufile_desc.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD; CUDF_EXPECTS(cuFileHandleRegister(&cufile_handle, &cufile_desc).err == CU_FILE_SUCCESS, - "Cannot map cufile"); - - struct stat st; - CUDF_EXPECTS(fstat(file.get_desc(), &st) != -1, "Cannot query file size"); + "Cannot register file handle with cuFile"); } -std::unique_ptr gdsfile::read(size_t offset, size_t size) +std::unique_ptr gdsinfile::read(size_t offset, size_t size) { rmm::device_buffer out_data(size); - cuFileRead(cufile_handle, out_data.data(), size, offset, 0); + CUDF_EXPECTS(cuFileRead(cufile_handle, out_data.data(), size, offset, 0) != -1, + "cuFile error reading from a file"); return datasource::buffer::create(std::move(out_data)); } -size_t gdsfile::read(size_t offset, size_t size, uint8_t *dst) +size_t gdsinfile::read(size_t offset, size_t size, uint8_t *dst) { - cuFileRead(cufile_handle, dst, size, offset, 0); + CUDF_EXPECTS(cuFileRead(cufile_handle, dst, size, offset, 0) != -1, + "cuFile error reading from a file"); // have to read the requested size for now return size; } -gdsfile::~gdsfile() { cuFileHandleDeregister(cufile_handle); } +gdsinfile::~gdsinfile() { cuFileHandleDeregister(cufile_handle); } + +gdsoutfile::gdsoutfile(std::string const &filepath) + : file(filepath, O_CREAT | O_RDWR | O_DIRECT, 0664) +{ + init_cufile_driver(); + + CUfileDescr_t cufile_desc{}; + cufile_desc.handle.fd = file.desc(); + cufile_desc.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD; + CUDF_EXPECTS(cuFileHandleRegister(&cufile_handle, &cufile_desc).err == CU_FILE_SUCCESS, + "Cannot register file handle with cuFile"); +} + +void gdsoutfile::write(void const *data, size_t offset, size_t size) +{ + CUDF_EXPECTS(cuFileWrite(cufile_handle, data, size, offset, 0) != -1, + "cuFile error writing to a file"); +} + +gdsoutfile::~gdsoutfile() { cuFileHandleDeregister(cufile_handle); } + }; // namespace io }; // namespace cudf \ No newline at end of file diff --git a/cpp/src/io/utilities/file_utils.hpp b/cpp/src/io/utilities/file_utils.hpp index 24f79751ca4..9cc470c22e1 100644 --- a/cpp/src/io/utilities/file_utils.hpp +++ b/cpp/src/io/utilities/file_utils.hpp @@ -16,6 +16,8 @@ #pragma once +#include + #include #include @@ -25,32 +27,39 @@ namespace cudf { namespace io { class file_wrapper { - int const fd = -1; + int const fd = -1; + long mutable _size = -1; public: - explicit file_wrapper(const char *filepath, int oflags); + explicit file_wrapper(std::string const &filepath, int flags); + explicit file_wrapper(std::string const &filepath, int flags, mode_t mode); ~file_wrapper(); - size_t size() const; - auto get_desc() const { return fd; } -}; - -struct cufile_driver { - cufile_driver() - { - if (cuFileDriverOpen().err != CU_FILE_SUCCESS) CUDF_FAIL("Cannot init cufile driver"); - } - ~cufile_driver() { cuFileDriverClose(); } + long size() const; + auto desc() const { return fd; } }; -class gdsfile { +class gdsinfile { public: - gdsfile(const char *filepath); + gdsinfile(std::string const &filepath); std::unique_ptr read(size_t offset, size_t size); size_t read(size_t offset, size_t size, uint8_t *dst); - ~gdsfile(); + ~gdsinfile(); + + private: + file_wrapper const file; + CUfileHandle_t cufile_handle = nullptr; +}; + +class gdsoutfile { + public: + gdsoutfile(std::string const &filepath); + + void write(void const *data, size_t offset, size_t size); + + ~gdsoutfile(); private: file_wrapper const file; From f70974e2d45907437e70745b49d9aba5b36c0080 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Tue, 22 Sep 2020 23:43:23 +0000 Subject: [PATCH 09/52] refactor cufile RAII into a separate struct --- cpp/src/io/utilities/data_sink.cpp | 2 +- cpp/src/io/utilities/datasource.cpp | 2 +- cpp/src/io/utilities/file_utils.cpp | 40 +++++++++++++---------------- cpp/src/io/utilities/file_utils.hpp | 22 ++++++++-------- 4 files changed, 32 insertions(+), 34 deletions(-) diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index a65f5b2b866..f0f300d4726 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -58,7 +58,7 @@ class file_sink : public data_sink { private: std::ofstream outfile_; size_t bytes_written_ = 0; - gdsoutfile _gds_file; + gds_output _gds_file; }; /** diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index bb66ae852d5..033ea49fa02 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -125,7 +125,7 @@ class memory_mapped_source : public datasource { void *map_addr_ = nullptr; size_t map_size_ = 0; size_t map_offset_ = 0; - gdsinfile _gds_file; + gds_input _gds_file; }; /** diff --git a/cpp/src/io/utilities/file_utils.cpp b/cpp/src/io/utilities/file_utils.cpp index 12deb910019..3251aeefdc5 100644 --- a/cpp/src/io/utilities/file_utils.cpp +++ b/cpp/src/io/utilities/file_utils.cpp @@ -59,55 +59,51 @@ long file_wrapper::size() const return _size; } -gdsinfile::gdsinfile(std::string const &filepath) : file(filepath, O_RDONLY | O_DIRECT) +cf_file_wrapper::cf_file_wrapper(int fd) { init_cufile_driver(); CUfileDescr_t cufile_desc{}; - cufile_desc.handle.fd = file.desc(); + cufile_desc.handle.fd = fd; cufile_desc.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD; - CUDF_EXPECTS(cuFileHandleRegister(&cufile_handle, &cufile_desc).err == CU_FILE_SUCCESS, + CUDF_EXPECTS(cuFileHandleRegister(&handle, &cufile_desc).err == CU_FILE_SUCCESS, "Cannot register file handle with cuFile"); } -std::unique_ptr gdsinfile::read(size_t offset, size_t size) +cf_file_wrapper::~cf_file_wrapper() { cuFileHandleDeregister(handle); } + +gds_input::gds_input(std::string const &filepath) + : file(filepath, O_RDONLY | O_DIRECT), cf_file{file.desc()} +{ +} + +std::unique_ptr gds_input::read(size_t offset, size_t size) { rmm::device_buffer out_data(size); - CUDF_EXPECTS(cuFileRead(cufile_handle, out_data.data(), size, offset, 0) != -1, + CUDF_EXPECTS(cuFileRead(cf_file.handle, out_data.data(), size, offset, 0) != -1, "cuFile error reading from a file"); return datasource::buffer::create(std::move(out_data)); } -size_t gdsinfile::read(size_t offset, size_t size, uint8_t *dst) +size_t gds_input::read(size_t offset, size_t size, uint8_t *dst) { - CUDF_EXPECTS(cuFileRead(cufile_handle, dst, size, offset, 0) != -1, + CUDF_EXPECTS(cuFileRead(cf_file.handle, dst, size, offset, 0) != -1, "cuFile error reading from a file"); // have to read the requested size for now return size; } -gdsinfile::~gdsinfile() { cuFileHandleDeregister(cufile_handle); } - -gdsoutfile::gdsoutfile(std::string const &filepath) - : file(filepath, O_CREAT | O_RDWR | O_DIRECT, 0664) +gds_output::gds_output(std::string const &filepath) + : file(filepath, O_CREAT | O_RDWR | O_DIRECT, 0664), cf_file(file.desc()) { - init_cufile_driver(); - - CUfileDescr_t cufile_desc{}; - cufile_desc.handle.fd = file.desc(); - cufile_desc.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD; - CUDF_EXPECTS(cuFileHandleRegister(&cufile_handle, &cufile_desc).err == CU_FILE_SUCCESS, - "Cannot register file handle with cuFile"); } -void gdsoutfile::write(void const *data, size_t offset, size_t size) +void gds_output::write(void const *data, size_t offset, size_t size) { - CUDF_EXPECTS(cuFileWrite(cufile_handle, data, size, offset, 0) != -1, + CUDF_EXPECTS(cuFileWrite(cf_file.handle, data, size, offset, 0) != -1, "cuFile error writing to a file"); } -gdsoutfile::~gdsoutfile() { cuFileHandleDeregister(cufile_handle); } - }; // namespace io }; // namespace cudf \ No newline at end of file diff --git a/cpp/src/io/utilities/file_utils.hpp b/cpp/src/io/utilities/file_utils.hpp index 9cc470c22e1..b1acd3aacc2 100644 --- a/cpp/src/io/utilities/file_utils.hpp +++ b/cpp/src/io/utilities/file_utils.hpp @@ -38,32 +38,34 @@ class file_wrapper { auto desc() const { return fd; } }; -class gdsinfile { +struct cf_file_wrapper { + CUfileHandle_t handle = nullptr; + explicit cf_file_wrapper(int fd); + ~cf_file_wrapper(); +}; + +class gds_input { public: - gdsinfile(std::string const &filepath); + gds_input(std::string const &filepath); std::unique_ptr read(size_t offset, size_t size); size_t read(size_t offset, size_t size, uint8_t *dst); - ~gdsinfile(); - private: file_wrapper const file; - CUfileHandle_t cufile_handle = nullptr; + cf_file_wrapper const cf_file; }; -class gdsoutfile { +class gds_output { public: - gdsoutfile(std::string const &filepath); + gds_output(std::string const &filepath); void write(void const *data, size_t offset, size_t size); - ~gdsoutfile(); - private: file_wrapper const file; - CUfileHandle_t cufile_handle = nullptr; + cf_file_wrapper const cf_file; }; }; // namespace io From 2acfeca7a6d78b7bb61c76dadadfc4ffe08d090a Mon Sep 17 00:00:00 2001 From: sft-managed Date: Fri, 2 Oct 2020 18:39:33 +0000 Subject: [PATCH 10/52] refactor gds threshold logic --- cpp/include/cudf/io/data_sink.hpp | 2 ++ cpp/include/cudf/io/datasource.hpp | 2 ++ cpp/src/io/parquet/reader_impl.cu | 6 ++--- cpp/src/io/parquet/writer_impl.cu | 11 ++------- cpp/src/io/utilities/data_sink.cpp | 5 ++++ cpp/src/io/utilities/datasource.cpp | 2 ++ cpp/src/io/utilities/file_utils.cpp | 7 ++---- cpp/src/io/utilities/file_utils.hpp | 36 +++++++++++++++++++++-------- 8 files changed, 44 insertions(+), 27 deletions(-) diff --git a/cpp/include/cudf/io/data_sink.hpp b/cpp/include/cudf/io/data_sink.hpp index 9f16ffa3105..ea299414967 100644 --- a/cpp/include/cudf/io/data_sink.hpp +++ b/cpp/include/cudf/io/data_sink.hpp @@ -105,6 +105,8 @@ class data_sink { **/ virtual bool supports_device_write() const { return false; } + virtual bool is_device_write_preferred(size_t size) const { return supports_device_write(); } + /** * @brief Append the buffer content to the sink from a gpu address * diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index 8bc04a11510..caeda7a0c99 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -150,6 +150,8 @@ class datasource { */ virtual bool supports_device_read() const { return false; } + virtual bool is_device_read_preferred(size_t size) const { return supports_device_read(); } + /** * @brief Returns a device buffer with a subset of data from the source. * diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index 66c4af3dab7..c4e4b160528 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -560,12 +560,12 @@ void reader::impl::read_column_chunks( } if (io_size != 0) { auto &source = _sources[chunk_source_map[chunk]]; - if (!source->supports_device_read()) { + if (source->is_device_read_preferred(io_size)) { + page_data[chunk] = source->device_read(io_offset, io_size); + } else { auto const buffer = source->host_read(io_offset, io_size); page_data[chunk] = datasource::buffer::create(rmm::device_buffer(buffer->data(), buffer->size(), stream)); - } else { - page_data[chunk] = source->device_read(io_offset, io_size); } uint8_t const *d_compdata = page_data[chunk]->data(); do { diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index f445e1c1e80..ffe32337df2 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -832,19 +832,12 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) state.stream); } - auto host_bfr = [&]() { - // if the writer supports device_write(), we don't need this scratch space - if (out_sink_->supports_device_write()) { - return pinned_buffer{nullptr, cudaFreeHost}; - } else { - return pinned_buffer{[](size_t size) { + auto host_bfr = pinned_buffer{[](size_t size) { uint8_t *ptr = nullptr; CUDA_TRY(cudaMallocHost(&ptr, size)); return ptr; }(max_chunk_bfr_size), cudaFreeHost}; - } - }(); // Encode row groups in batches for (uint32_t b = 0, r = 0, global_r = global_rowgroup_base; b < (uint32_t)batch_list.size(); @@ -880,7 +873,7 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) dev_bfr = ck->uncompressed_bfr; } - if (out_sink_->supports_device_write()) { + if (out_sink_->is_device_write_preferred(ck->compressed_size)) { // let the writer do what it wants to retrieve the data from the gpu. out_sink_->device_write(dev_bfr + ck->ck_stat_size, ck->compressed_size, state.stream); // we still need to do a (much smaller) memcpy for the statistics. diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index f0f300d4726..86341af49e5 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -49,6 +49,11 @@ class file_sink : public data_sink { bool supports_device_write() const override { return true; } + bool is_device_write_preferred(size_t size) const override + { + return _gds_file.is_gds_io_preferred(size); + } + void device_write(void const* gpu_data, size_t size, cudaStream_t stream) override { _gds_file.write(gpu_data, bytes_written_, size); diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 033ea49fa02..f348a74007c 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -83,6 +83,8 @@ class memory_mapped_source : public datasource { bool supports_device_read() const override { return true; } + bool is_device_read_preferred(size_t size) const { return _gds_file.is_gds_io_preferred(size); } + std::unique_ptr device_read(size_t offset, size_t size) override { auto const read_size = std::min(size, map_size_ - (offset - map_offset_)); diff --git a/cpp/src/io/utilities/file_utils.cpp b/cpp/src/io/utilities/file_utils.cpp index 3251aeefdc5..8e085673420 100644 --- a/cpp/src/io/utilities/file_utils.cpp +++ b/cpp/src/io/utilities/file_utils.cpp @@ -72,10 +72,7 @@ cf_file_wrapper::cf_file_wrapper(int fd) cf_file_wrapper::~cf_file_wrapper() { cuFileHandleDeregister(handle); } -gds_input::gds_input(std::string const &filepath) - : file(filepath, O_RDONLY | O_DIRECT), cf_file{file.desc()} -{ -} +gds_input::gds_input(std::string const &filepath) : gds_io_base(filepath, O_RDONLY | O_DIRECT) {} std::unique_ptr gds_input::read(size_t offset, size_t size) { @@ -95,7 +92,7 @@ size_t gds_input::read(size_t offset, size_t size, uint8_t *dst) } gds_output::gds_output(std::string const &filepath) - : file(filepath, O_CREAT | O_RDWR | O_DIRECT, 0664), cf_file(file.desc()) + : gds_io_base(filepath, O_CREAT | O_RDWR | O_DIRECT, 0664) { } diff --git a/cpp/src/io/utilities/file_utils.hpp b/cpp/src/io/utilities/file_utils.hpp index b1acd3aacc2..1df159af605 100644 --- a/cpp/src/io/utilities/file_utils.hpp +++ b/cpp/src/io/utilities/file_utils.hpp @@ -44,28 +44,44 @@ struct cf_file_wrapper { ~cf_file_wrapper(); }; -class gds_input { +class gds_io_base { + public: + gds_io_base(std::string const &filepath, int flags) : file(filepath, flags), cf_file{file.desc()} + { + } + gds_io_base(std::string const &filepath, int flags, mode_t mode) + : file(filepath, flags, mode), cf_file{file.desc()} + { + } + + static bool is_gds_io_preferred(size_t size) { return size > op_size_threshold; } + + protected: + /** + * @brief The read/write size above which GDS is faster then host read + copy + * + * This may not be the optimal threshold for all systems. `is_gds_io_preferred` can use a + * different logic based on the system config. + */ + static constexpr size_t op_size_threshold = 128 << 10; + file_wrapper const file; + cf_file_wrapper const cf_file; +}; + +class gds_input : public gds_io_base { public: gds_input(std::string const &filepath); std::unique_ptr read(size_t offset, size_t size); size_t read(size_t offset, size_t size, uint8_t *dst); - - private: - file_wrapper const file; - cf_file_wrapper const cf_file; }; -class gds_output { +class gds_output : public gds_io_base { public: gds_output(std::string const &filepath); void write(void const *data, size_t offset, size_t size); - - private: - file_wrapper const file; - cf_file_wrapper const cf_file; }; }; // namespace io From 73092fcf66edf8374d2db441543eec8aa6069ca9 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Wed, 7 Oct 2020 18:13:38 +0000 Subject: [PATCH 11/52] override the cufile config file to always enable compatibility mode --- cpp/CMakeLists.txt | 7 ++- cpp/config/cufile.json | 70 +++++++++++++++++++++++++++++ cpp/src/io/utilities/file_utils.cpp | 22 ++++++++- 3 files changed, 97 insertions(+), 2 deletions(-) create mode 100644 cpp/config/cufile.json diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index bcc7ef7535e..d455103969a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -296,7 +296,9 @@ message(STATUS "CUFILE: CUFILE_INCLUDE set to ${CUFILE_INCLUDE}") add_library(cufile SHARED IMPORTED ${CUFILE_LIBRARY}) if (CUFILE_INCLUDE AND CUFILE_LIBRARY) - set_target_properties(cufile PROPERTIES IMPORTED_LOCATION ${CUFILE_LIBRARY}) + set_target_properties(cufile PROPERTIES IMPORTED_LOCATION ${CUFILE_LIBRARY}) + configure_file("${CMAKE_CURRENT_SOURCE_DIR}/config/cufile.json" + "${CMAKE_BINARY_DIR}/config/cufile.json") else() message(FATAL_ERROR "cufile not found, please pass the GDS install directory using -DGDS_ROOT") endif (CUFILE_INCLUDE AND CUFILE_LIBRARY) @@ -820,6 +822,9 @@ install(TARGETS cudf install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/cudf DESTINATION include COMPONENT cudf) +install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/config/cufile.json + DESTINATION lib + COMPONENT cudf) install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/cudf_test DESTINATION include COMPONENT cudf) diff --git a/cpp/config/cufile.json b/cpp/config/cufile.json new file mode 100644 index 00000000000..7ad2d29bfa9 --- /dev/null +++ b/cpp/config/cufile.json @@ -0,0 +1,70 @@ +{ + "logging": { + // log directory, if not enabled will create log file under current working directory + //"dir": "/home/", + + // ERROR|WARN|INFO|DEBUG|TRACE (in decreasing order of priority) + "level": "ERROR" + }, + + "profile": { + // nvtx profiling on/off + "nvtx": false, + // cufile stats level(0-3) + "cufile_stats": 0 + }, + + "properties": { + // max IO chunk size (4K aligned) used by cuFile per IO request (in KB) + "max_direct_io_size_kb" : 16384, + // device memory size (4K aligned) for reserving bounce buffers for the entire GPU (in KB) + "max_device_cache_size_kb" : 131072, + // limit on maximum memory (4K aligned) that can be pinned for a given process (in KB) + "max_device_pinned_mem_size_kb" : 33554432, + // true or false (true will enable asynchronous io submission to nvidia-fs driver) + "use_poll_mode" : false, + // maximum IO request size (4K aligned) within or equal to which library will poll (in KB) + "poll_mode_max_size_kb": 4, + // allow compat mode, this will enable use of cufile posix read/writes + "allow_compat_mode": true, + // client-side rdma addr list for user-space file-systems(e.g ["10.0.1.0", "10.0.2.0"]) + "rdma_dev_addr_list": [ + "172.16.8.240","172.16.8.241", + "172.16.8.242","172.16.8.243", + "172.16.8.245","172.16.8.246", + "172.16.8.247","172.16.8.248" + ] + }, + + "fs": { + "generic": { + + // for unaligned writes, setting it to true will use posix write instead of cuFileWrite + "posix_unaligned_writes" : false + }, + + "lustre": { + + // IO threshold for read/write (4K aligned)) equal to or below which cufile will use posix reads (KB) + "posix_gds_min_kb" : 0 + } + }, + + "blacklist": { + // specify list of vendor driver modules to blacklist for nvidia-fs (e.g. ["nvme" , "nvme_rdma"]) + "drivers": [ ], + + // specify list of block devices to prevent IO using libcufile (e.g. [ "/dev/sda1" ]) + "devices": [ ], + + // specify list of mount points to prevent IO using libcufile (e.g. ["/mnt/test"]) + "mounts": [ ], + + // specify list of file-systems to prevent IO using libcufile (e.g ["lustre", "wekafs"]) + "filesystems": [ ] + } + + // Application can override custom configuration via export CUFILE_ENV_PATH_JSON= + // e.g : export CUFILE_ENV_PATH_JSON="/home//cufile.json" +} + diff --git a/cpp/src/io/utilities/file_utils.cpp b/cpp/src/io/utilities/file_utils.cpp index 8e085673420..075e093c107 100644 --- a/cpp/src/io/utilities/file_utils.cpp +++ b/cpp/src/io/utilities/file_utils.cpp @@ -15,7 +15,9 @@ */ #include +#include #include +#include #include #include #include @@ -37,10 +39,28 @@ file_wrapper::file_wrapper(std::string const &filepath, int flags, mode_t mode) CUDF_EXPECTS(fd != -1, "Cannot open file"); } +/** + * Returns the directory from which the libcudf.so is loaded. + */ +std::string get_libcudf_dir_path() +{ + Dl_info dl_info; + dladdr((void *)get_libcudf_path, &dl_info); + std::string full_path{dl_info.dli_fname}; + auto const dir_path = full_path.substr(0, full_path.find_last_of('/') + 1); + return dir_path; +} + struct cufile_driver { cufile_driver() { - if (cuFileDriverOpen().err != CU_FILE_SUCCESS) CUDF_FAIL("Cannot init cufile driver"); + // Unless CUFILE_ENV_PATH_JSON is already set, set the env var to point to a config file with + // enabled compatiblity mode + auto const cufile_config_path = get_libcudf_dir_path() + "config/cufile.json"; + CUDF_EXPECTS(setenv("CUFILE_ENV_PATH_JSON", cufile_config_path.c_str(), 0) == 0, + "Failed to set the cuFile config file environment variable."); + + CUDF_EXPECTS(cuFileDriverOpen().err == CU_FILE_SUCCESS, "Failed to initialize cuFile driver"); } ~cufile_driver() { cuFileDriverClose(); } }; From 2aa7da78754eecbf6ac650a69ff92d029ba3506b Mon Sep 17 00:00:00 2001 From: sft-managed Date: Wed, 7 Oct 2020 18:23:21 +0000 Subject: [PATCH 12/52] fix missed rename --- cpp/src/io/utilities/file_utils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/utilities/file_utils.cpp b/cpp/src/io/utilities/file_utils.cpp index 075e093c107..44d43459c60 100644 --- a/cpp/src/io/utilities/file_utils.cpp +++ b/cpp/src/io/utilities/file_utils.cpp @@ -45,7 +45,7 @@ file_wrapper::file_wrapper(std::string const &filepath, int flags, mode_t mode) std::string get_libcudf_dir_path() { Dl_info dl_info; - dladdr((void *)get_libcudf_path, &dl_info); + dladdr((void *)get_libcudf_dir_path, &dl_info); std::string full_path{dl_info.dli_fname}; auto const dir_path = full_path.substr(0, full_path.find_last_of('/') + 1); return dir_path; From 689a4d2ebe4a10ae3ec078768cef5cd907884d41 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Wed, 7 Oct 2020 22:09:41 +0000 Subject: [PATCH 13/52] add docs; rename data members --- cpp/src/io/utilities/data_sink.cpp | 38 ++++++++-------- cpp/src/io/utilities/datasource.cpp | 59 ++++++++++++------------ cpp/src/io/utilities/file_utils.cpp | 34 ++++++++++---- cpp/src/io/utilities/file_utils.hpp | 70 +++++++++++++++++++++++------ 4 files changed, 131 insertions(+), 70 deletions(-) diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 86341af49e5..163080b03e1 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -28,42 +28,42 @@ namespace io { */ class file_sink : public data_sink { public: - explicit file_sink(std::string const& filepath) : _gds_file(filepath) + explicit file_sink(std::string const& filepath) : _cufile_out(filepath) { - outfile_.open(filepath, std::ios::out | std::ios::binary | std::ios::trunc); - CUDF_EXPECTS(outfile_.is_open(), "Cannot open output file"); + _output_stream.open(filepath, std::ios::out | std::ios::binary | std::ios::trunc); + CUDF_EXPECTS(_output_stream.is_open(), "Cannot open output file"); } virtual ~file_sink() { flush(); } void host_write(void const* data, size_t size) override { - outfile_.seekp(bytes_written_); - outfile_.write(reinterpret_cast(data), size); - bytes_written_ += size; + _output_stream.seekp(_bytes_written); + _output_stream.write(reinterpret_cast(data), size); + _bytes_written += size; } - void flush() override { outfile_.flush(); } + void flush() override { _output_stream.flush(); } - size_t bytes_written() override { return bytes_written_; } + size_t bytes_written() override { return _bytes_written; } bool supports_device_write() const override { return true; } bool is_device_write_preferred(size_t size) const override { - return _gds_file.is_gds_io_preferred(size); + return _cufile_out.is_cufile_io_preferred(size); } void device_write(void const* gpu_data, size_t size, cudaStream_t stream) override { - _gds_file.write(gpu_data, bytes_written_, size); - bytes_written_ += size; + _cufile_out.write(gpu_data, _bytes_written, size); + _bytes_written += size; } private: - std::ofstream outfile_; - size_t bytes_written_ = 0; - gds_output _gds_file; + std::ofstream _output_stream; + size_t _bytes_written = 0; + cufile_output _cufile_out; }; /** @@ -96,25 +96,25 @@ class host_buffer_sink : public data_sink { */ class void_sink : public data_sink { public: - explicit void_sink() : bytes_written_(0) {} + explicit void_sink() : _bytes_written(0) {} virtual ~void_sink() {} - void host_write(void const* data, size_t size) override { bytes_written_ += size; } + void host_write(void const* data, size_t size) override { _bytes_written += size; } bool supports_device_write() const override { return true; } void device_write(void const* gpu_data, size_t size, cudaStream_t stream) override { - bytes_written_ += size; + _bytes_written += size; } void flush() override {} - size_t bytes_written() override { return bytes_written_; } + size_t bytes_written() override { return _bytes_written; } private: - size_t bytes_written_; + size_t _bytes_written; }; class user_sink_wrapper : public data_sink { diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index f348a74007c..62b305fde7e 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -46,88 +46,91 @@ class memory_mapped_source : public datasource { public: explicit memory_mapped_source(const char *filepath, size_t offset, size_t size) - : _gds_file(filepath) + : _cufile_in(filepath) { auto const file = file_wrapper(filepath, O_RDONLY); - file_size_ = file.size(); - if (file_size_ != 0) { map(file.desc(), offset, size); } + _file_size = file.size(); + if (_file_size != 0) { map(file.desc(), offset, size); } } virtual ~memory_mapped_source() { - if (map_addr_ != nullptr) { munmap(map_addr_, map_size_); } + if (_map_addr != nullptr) { munmap(_map_addr, _map_size); } } std::unique_ptr host_read(size_t offset, size_t size) override { - CUDF_EXPECTS(offset >= map_offset_, "Requested offset is outside mapping"); + CUDF_EXPECTS(offset >= _map_offset, "Requested offset is outside mapping"); // Clamp length to available data in the mapped region - auto const read_size = std::min(size, map_size_ - (offset - map_offset_)); + auto const read_size = std::min(size, _map_size - (offset - _map_offset)); return std::make_unique( - static_cast(map_addr_) + (offset - map_offset_), read_size); + static_cast(_map_addr) + (offset - _map_offset), read_size); } size_t host_read(size_t offset, size_t size, uint8_t *dst) override { - CUDF_EXPECTS(offset >= map_offset_, "Requested offset is outside mapping"); + CUDF_EXPECTS(offset >= _map_offset, "Requested offset is outside mapping"); // Clamp length to available data in the mapped region - auto const read_size = std::min(size, map_size_ - (offset - map_offset_)); + auto const read_size = std::min(size, _map_size - (offset - _map_offset)); - auto const src = static_cast(map_addr_) + (offset - map_offset_); + auto const src = static_cast(_map_addr) + (offset - _map_offset); std::memcpy(dst, src, read_size); return read_size; } bool supports_device_read() const override { return true; } - bool is_device_read_preferred(size_t size) const { return _gds_file.is_gds_io_preferred(size); } + bool is_device_read_preferred(size_t size) const + { + return _cufile_in.is_cufile_io_preferred(size); + } std::unique_ptr device_read(size_t offset, size_t size) override { - auto const read_size = std::min(size, map_size_ - (offset - map_offset_)); - return _gds_file.read(offset, size); + auto const read_size = std::min(size, _map_size - (offset - _map_offset)); + return _cufile_in.read(offset, size); } size_t device_read(size_t offset, size_t size, uint8_t *dst) override { - auto const read_size = std::min(size, map_size_ - (offset - map_offset_)); - return _gds_file.read(offset, size, dst); + auto const read_size = std::min(size, _map_size - (offset - _map_offset)); + return _cufile_in.read(offset, size, dst); } - size_t size() const override { return file_size_; } + size_t size() const override { return _file_size; } private: void map(int fd, size_t offset, size_t size) { - CUDF_EXPECTS(offset < file_size_, "Offset is past end of file"); + CUDF_EXPECTS(offset < _file_size, "Offset is past end of file"); // Offset for `mmap()` must be page aligned - map_offset_ = offset & ~(sysconf(_SC_PAGESIZE) - 1); + _map_offset = offset & ~(sysconf(_SC_PAGESIZE) - 1); // Clamp length to available data in the file if (size == 0) { - size = file_size_ - offset; + size = _file_size - offset; } else { - if ((offset + size) > file_size_) { size = file_size_ - offset; } + if ((offset + size) > _file_size) { size = _file_size - offset; } } // Size for `mmap()` needs to include the page padding - map_size_ = size + (offset - map_offset_); + _map_size = size + (offset - _map_offset); // Check if accessing a region within already mapped area - map_addr_ = mmap(nullptr, map_size_, PROT_READ, MAP_PRIVATE, fd, map_offset_); - CUDF_EXPECTS(map_addr_ != MAP_FAILED, "Cannot create memory mapping"); + _map_addr = mmap(nullptr, _map_size, PROT_READ, MAP_PRIVATE, fd, _map_offset); + CUDF_EXPECTS(_map_addr != MAP_FAILED, "Cannot create memory mapping"); } private: - size_t file_size_ = 0; - void *map_addr_ = nullptr; - size_t map_size_ = 0; - size_t map_offset_ = 0; - gds_input _gds_file; + size_t _file_size = 0; + void *_map_addr = nullptr; + size_t _map_size = 0; + size_t _map_offset = 0; + cufile_input _cufile_in; }; /** diff --git a/cpp/src/io/utilities/file_utils.cpp b/cpp/src/io/utilities/file_utils.cpp index 44d43459c60..36f69d02f4e 100644 --- a/cpp/src/io/utilities/file_utils.cpp +++ b/cpp/src/io/utilities/file_utils.cpp @@ -44,13 +44,19 @@ file_wrapper::file_wrapper(std::string const &filepath, int flags, mode_t mode) */ std::string get_libcudf_dir_path() { - Dl_info dl_info; + Dl_info dl_info{}; dladdr((void *)get_libcudf_dir_path, &dl_info); std::string full_path{dl_info.dli_fname}; auto const dir_path = full_path.substr(0, full_path.find_last_of('/') + 1); return dir_path; } +/** + * @brief Class that provides RAII for cuFile driver management. + * + * Should be used as a singleton. Sets the environment path to point to cudf cuFile config file + * (enables compatilibity mode). + */ struct cufile_driver { cufile_driver() { @@ -65,6 +71,11 @@ struct cufile_driver { ~cufile_driver() { cuFileDriverClose(); } }; +/** + * @brief Initializes the cuFile driver. + * + * Needs to be called before any cuFile operation. + */ void init_cufile_driver() { static cufile_driver driver; } file_wrapper::~file_wrapper() { close(fd); } @@ -79,7 +90,7 @@ long file_wrapper::size() const return _size; } -cf_file_wrapper::cf_file_wrapper(int fd) +cufile_registered_file::cufile_registered_file(int fd) { init_cufile_driver(); @@ -90,11 +101,15 @@ cf_file_wrapper::cf_file_wrapper(int fd) "Cannot register file handle with cuFile"); } -cf_file_wrapper::~cf_file_wrapper() { cuFileHandleDeregister(handle); } +cufile_registered_file::~cufile_registered_file() { cuFileHandleDeregister(handle); } -gds_input::gds_input(std::string const &filepath) : gds_io_base(filepath, O_RDONLY | O_DIRECT) {} +cufile_input::cufile_input(std::string const &filepath) + : cufile_io_base(filepath, O_RDONLY | O_DIRECT) +{ + init_cufile_driver(); +} -std::unique_ptr gds_input::read(size_t offset, size_t size) +std::unique_ptr cufile_input::read(size_t offset, size_t size) { rmm::device_buffer out_data(size); CUDF_EXPECTS(cuFileRead(cf_file.handle, out_data.data(), size, offset, 0) != -1, @@ -103,7 +118,7 @@ std::unique_ptr gds_input::read(size_t offset, size_t size) return datasource::buffer::create(std::move(out_data)); } -size_t gds_input::read(size_t offset, size_t size, uint8_t *dst) +size_t cufile_input::read(size_t offset, size_t size, uint8_t *dst) { CUDF_EXPECTS(cuFileRead(cf_file.handle, dst, size, offset, 0) != -1, "cuFile error reading from a file"); @@ -111,12 +126,13 @@ size_t gds_input::read(size_t offset, size_t size, uint8_t *dst) return size; } -gds_output::gds_output(std::string const &filepath) - : gds_io_base(filepath, O_CREAT | O_RDWR | O_DIRECT, 0664) +cufile_output::cufile_output(std::string const &filepath) + : cufile_io_base(filepath, O_CREAT | O_RDWR | O_DIRECT, 0664) { + init_cufile_driver(); } -void gds_output::write(void const *data, size_t offset, size_t size) +void cufile_output::write(void const *data, size_t offset, size_t size) { CUDF_EXPECTS(cuFileWrite(cf_file.handle, data, size, offset, 0) != -1, "cuFile error writing to a file"); diff --git a/cpp/src/io/utilities/file_utils.hpp b/cpp/src/io/utilities/file_utils.hpp index 1df159af605..8f6776ff96d 100644 --- a/cpp/src/io/utilities/file_utils.hpp +++ b/cpp/src/io/utilities/file_utils.hpp @@ -26,6 +26,9 @@ namespace cudf { namespace io { +/** + * @brief Class that provides RAII for file handling. + */ class file_wrapper { int const fd = -1; long mutable _size = -1; @@ -38,49 +41,88 @@ class file_wrapper { auto desc() const { return fd; } }; -struct cf_file_wrapper { +/** + * @brief Class that provides RAII for cuFile file registration. + */ +struct cufile_registered_file { CUfileHandle_t handle = nullptr; - explicit cf_file_wrapper(int fd); - ~cf_file_wrapper(); + explicit cufile_registered_file(int fd); + ~cufile_registered_file(); }; -class gds_io_base { +/** + * @brief Base class for cuFile input/output. + * + * Contains the file handles and common API for cuFile input and output classes. + */ +class cufile_io_base { public: - gds_io_base(std::string const &filepath, int flags) : file(filepath, flags), cf_file{file.desc()} + cufile_io_base(std::string const &filepath, int flags) + : file(filepath, flags), cf_file{file.desc()} { } - gds_io_base(std::string const &filepath, int flags, mode_t mode) + cufile_io_base(std::string const &filepath, int flags, mode_t mode) : file(filepath, flags, mode), cf_file{file.desc()} { } - static bool is_gds_io_preferred(size_t size) { return size > op_size_threshold; } + virtual ~cufile_io_base() = default; + + /** + * @brief Returns an estimate of whether the cuFile operation is the optimal option. + * + * @param size Read/write operation size, in bytes. + * @return Whether a cuFile operation with the given size is expected to be faster than a host + * read + H2D copy + */ + static bool is_cufile_io_preferred(size_t size) { return size > op_size_threshold; } protected: /** - * @brief The read/write size above which GDS is faster then host read + copy + * @brief The read/write size above which cuFile is faster then host read + copy * - * This may not be the optimal threshold for all systems. `is_gds_io_preferred` can use a + * This may not be the optimal threshold for all systems. `is_cufile_io_preferred` can use a * different logic based on the system config. */ static constexpr size_t op_size_threshold = 128 << 10; file_wrapper const file; - cf_file_wrapper const cf_file; + cufile_registered_file const cf_file; }; -class gds_input : public gds_io_base { +/** + * @brief Adapter for the `cuFileRead` API. + * + * Exposes APIs to read directly from a file into device memory. + */ +class cufile_input final : public cufile_io_base { public: - gds_input(std::string const &filepath); + cufile_input(std::string const &filepath); + /** + * @brief Reads into a new device buffer. + */ std::unique_ptr read(size_t offset, size_t size); + /** + * @brief Reads into existing device memory. + * + * Returns the number of bytes read. + */ size_t read(size_t offset, size_t size, uint8_t *dst); }; -class gds_output : public gds_io_base { +/** + * @brief Adapter for the `cuFileWrite` API. + * + * Exposes an API to write directly into a file from device memory. + */ +class cufile_output final : public cufile_io_base { public: - gds_output(std::string const &filepath); + cufile_output(std::string const &filepath); + /** + * @brief Writes the data from a device buffer into a file. + */ void write(void const *data, size_t offset, size_t size); }; From 5807bfda9d2d780d6e071ba24ea8bfad25343449 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Fri, 30 Oct 2020 19:18:42 +0000 Subject: [PATCH 14/52] fix merge --- cpp/CMakeLists.txt | 237 --------------------------------------------- 1 file changed, 237 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f48612f39d5..ab2689e156e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -414,243 +414,6 @@ if(CONDA_LINK_DIRS) link_directories("${CONDA_LINK_DIRS}") endif(CONDA_LINK_DIRS) -################################################################################################### -# - library targets ------------------------------------------------------------------------------- - -add_library(cudf - src/comms/ipc/ipc.cpp - src/merge/merge.cu - src/partitioning/round_robin.cu - src/join/join.cu - src/join/hash_join.cu - src/join/cross_join.cu - src/join/semi_join.cu - src/sort/is_sorted.cu - src/binaryop/binaryop.cpp - src/binaryop/compiled/binary_ops.cu - src/binaryop/jit/code/kernel.cpp - src/binaryop/jit/code/operation.cpp - src/binaryop/jit/code/traits.cpp - src/interop/from_arrow.cpp - src/interop/to_arrow.cpp - src/interop/dlpack.cpp - src/jit/type.cpp - src/jit/parser.cpp - src/jit/cache.cpp - src/jit/launcher.cpp - src/transform/jit/code/kernel.cpp - src/transform/transform.cpp - src/transform/nans_to_nulls.cu - src/transform/bools_to_mask.cu - src/transform/mask_to_bools.cu - src/transform/encode.cu - src/stream_compaction/apply_boolean_mask.cu - src/stream_compaction/drop_nulls.cu - src/stream_compaction/drop_nans.cu - src/stream_compaction/drop_duplicates.cu - src/datetime/datetime_ops.cu - src/hash/hashing.cu - src/partitioning/partitioning.cu - src/quantiles/quantile.cu - src/quantiles/quantiles.cu - src/reductions/reductions.cpp - src/reductions/nth_element.cu - src/reductions/min.cu - src/reductions/max.cu - src/reductions/minmax.cu - src/reductions/any.cu - src/reductions/all.cu - src/reductions/sum.cu - src/reductions/product.cu - src/reductions/sum_of_squares.cu - src/reductions/mean.cu - src/reductions/var.cu - src/reductions/std.cu - src/reductions/scan.cu - src/replace/replace.cu - src/replace/clamp.cu - src/replace/nans.cu - src/replace/nulls.cu - src/reshape/interleave_columns.cu - src/transpose/transpose.cu - src/unary/cast_ops.cu - src/unary/null_ops.cu - src/unary/nan_ops.cu - src/unary/math_ops.cu - src/unary/unary_ops.cuh - src/io/avro/avro_gpu.cu - src/io/avro/avro.cpp - src/io/avro/reader_impl.cu - src/io/csv/csv_gpu.cu - src/io/csv/reader_impl.cu - src/io/csv/writer_impl.cu - src/io/csv/durations.cu - src/io/json/reader_impl.cu - src/io/json/json_gpu.cu - src/io/orc/orc.cpp - src/io/orc/timezone.cpp - src/io/orc/stripe_data.cu - src/io/orc/stripe_init.cu - src/io/orc/stripe_enc.cu - src/io/orc/dict_enc.cu - src/io/orc/stats_enc.cu - src/io/orc/reader_impl.cu - src/io/orc/writer_impl.cu - src/io/parquet/page_data.cu - src/io/parquet/page_hdr.cu - src/io/parquet/page_enc.cu - src/io/parquet/page_dict.cu - src/io/parquet/parquet.cpp - src/io/parquet/reader_impl.cu - src/io/parquet/writer_impl.cu - src/io/comp/cpu_unbz2.cpp - src/io/comp/uncomp.cpp - src/io/comp/brotli_dict.cpp - src/io/comp/debrotli.cu - src/io/comp/snap.cu - src/io/comp/unsnap.cu - src/io/comp/gpuinflate.cu - src/io/functions.cpp - src/io/statistics/column_stats.cu - src/io/utilities/datasource.cpp - src/io/utilities/file_utils.cpp - src/io/utilities/parsing_utils.cu - src/io/utilities/type_conversion.cu - src/io/utilities/data_sink.cpp - src/copying/gather.cu - src/copying/copy.cpp - src/copying/sample.cu - src/copying/scatter.cu - src/copying/shift.cu - src/copying/copy.cu - src/copying/concatenate.cu - src/copying/slice.cpp - src/copying/split.cpp - src/copying/contiguous_split.cu - src/copying/copy_range.cu - src/copying/get_element.cu - src/filling/fill.cu - src/filling/repeat.cu - src/filling/sequence.cu - src/reshape/byte_cast.cu - src/reshape/tile.cu - src/search/search.cu - src/column/column.cu - src/column/column_view.cpp - src/column/column_device_view.cu - src/column/column_factories.cpp - src/table/table_view.cpp - src/table/table_device_view.cu - src/table/table.cpp - src/bitmask/null_mask.cu - src/rolling/rolling.cu - src/rolling/jit/code/kernel.cpp - src/rolling/jit/code/operation.cpp - src/sort/sort.cu - src/sort/stable_sort.cu - src/sort/rank.cu - src/strings/attributes.cu - src/strings/case.cu - src/strings/wrap.cu - src/strings/capitalize.cu - src/strings/char_types/char_types.cu - src/strings/char_types/char_cases.cu - src/strings/combine.cu - src/strings/contains.cu - src/strings/convert/convert_booleans.cu - src/strings/convert/convert_datetime.cu - src/strings/convert/convert_durations.cu - src/strings/convert/convert_floats.cu - src/strings/convert/convert_hex.cu - src/strings/convert/convert_integers.cu - src/strings/convert/convert_ipv4.cu - src/strings/convert/convert_urls.cu - src/strings/copying/concatenate.cu - src/strings/copying/copying.cu - src/strings/extract.cu - src/strings/filter_chars.cu - src/strings/find.cu - src/strings/findall.cu - src/strings/find_multiple.cu - src/strings/filling/fill.cu - src/strings/padding.cu - src/strings/regex/regcomp.cpp - src/strings/regex/regexec.cu - src/strings/replace/replace_re.cu - src/strings/replace/backref_re.cu - src/strings/replace/backref_re_medium.cu - src/strings/replace/backref_re_large.cu - src/strings/replace/multi_re.cu - src/strings/replace/replace.cu - src/strings/sorting/sorting.cu - src/strings/split/partition.cu - src/strings/split/split.cu - src/strings/split/split_record.cu - src/strings/strings_column_factories.cu - src/strings/strings_column_view.cu - src/strings/strings_scalar_factories.cpp - src/strings/strip.cu - src/strings/substring.cu - src/strings/translate.cu - src/strings/utilities.cu - src/lists/extract.cu - src/lists/lists_column_factories.cu - src/lists/lists_column_view.cu - src/lists/copying/concatenate.cu - src/lists/copying/gather.cu - src/lists/copying/copying.cu - src/structs/structs_column_view.cu - src/structs/structs_column_factories.cu - src/text/detokenize.cu - src/text/edit_distance.cu - src/text/generate_ngrams.cu - src/text/normalize.cu - src/text/stemmer.cu - src/text/tokenize.cu - src/text/ngrams_tokenize.cu - src/text/replace.cu - src/text/subword/load_hash_file.cu - src/text/subword/data_normalizer.cu - src/text/subword/wordpiece_tokenizer.cu - src/text/subword/subword_tokenize.cu - src/scalar/scalar.cpp - src/scalar/scalar_factories.cpp - src/dictionary/add_keys.cu - src/dictionary/detail/concatenate.cu - src/dictionary/dictionary_column_view.cpp - src/dictionary/dictionary_factories.cu - src/dictionary/decode.cu - src/dictionary/encode.cu - src/dictionary/remove_keys.cu - src/dictionary/replace.cu - src/dictionary/search.cu - src/dictionary/set_keys.cu - src/groupby/groupby.cu - src/groupby/hash/groupby.cu - src/groupby/sort/groupby.cu - src/groupby/sort/sort_helper.cu - src/groupby/sort/group_sum.cu - src/groupby/sort/group_min.cu - src/groupby/sort/group_max.cu - src/groupby/sort/group_argmax.cu - src/groupby/sort/group_argmin.cu - src/groupby/sort/group_count.cu - src/groupby/sort/group_nunique.cu - src/groupby/sort/group_nth_element.cu - src/groupby/sort/group_std.cu - src/groupby/sort/group_quantiles.cu - src/groupby/sort/group_collect.cu - src/aggregation/aggregation.cpp - src/aggregation/aggregation.cu - src/aggregation/result_cache.cpp - src/ast/transform.cu - src/ast/linearizer.cpp - src/utilities/default_stream.cpp -) - -# Override RPATH for cudf -set_target_properties(cudf PROPERTIES BUILD_RPATH "\$ORIGIN") - ################################################################################################### # - jitify ---------------------------------------------------------------------------------------- From 52845914206376f0259a125380558144464d2375 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Fri, 30 Oct 2020 19:20:40 +0000 Subject: [PATCH 15/52] add missing EOF newlines --- cpp/src/io/utilities/file_utils.cpp | 2 +- cpp/src/io/utilities/file_utils.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/utilities/file_utils.cpp b/cpp/src/io/utilities/file_utils.cpp index 36f69d02f4e..18fb9afd179 100644 --- a/cpp/src/io/utilities/file_utils.cpp +++ b/cpp/src/io/utilities/file_utils.cpp @@ -139,4 +139,4 @@ void cufile_output::write(void const *data, size_t offset, size_t size) } }; // namespace io -}; // namespace cudf \ No newline at end of file +}; // namespace cudf diff --git a/cpp/src/io/utilities/file_utils.hpp b/cpp/src/io/utilities/file_utils.hpp index 8f6776ff96d..85093b2edb6 100644 --- a/cpp/src/io/utilities/file_utils.hpp +++ b/cpp/src/io/utilities/file_utils.hpp @@ -127,4 +127,4 @@ class cufile_output final : public cufile_io_base { }; }; // namespace io -}; // namespace cudf \ No newline at end of file +}; // namespace cudf From 7000d55638ce14804e05af1f925d20557db4195d Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 20 Nov 2020 13:21:37 -0800 Subject: [PATCH 16/52] fall back to host read/write if cufile fails --- cpp/src/io/utilities/data_sink.cpp | 10 ++++++---- cpp/src/io/utilities/datasource.cpp | 15 +++++++++------ cpp/src/io/utilities/file_utils.cpp | 18 ++++++++++++++++++ cpp/src/io/utilities/file_utils.hpp | 14 ++++++++++++++ 4 files changed, 47 insertions(+), 10 deletions(-) diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 04291bc327e..a5939547c0e 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -28,7 +28,7 @@ namespace io { */ class file_sink : public data_sink { public: - explicit file_sink(std::string const& filepath) : _cufile_out(filepath) + explicit file_sink(std::string const& filepath) : _cufile_out(make_cufile_output(filepath)) { _output_stream.open(filepath, std::ios::out | std::ios::binary | std::ios::trunc); CUDF_EXPECTS(_output_stream.is_open(), "Cannot open output file"); @@ -51,19 +51,21 @@ class file_sink : public data_sink { bool is_device_write_preferred(size_t size) const override { - return _cufile_out.is_cufile_io_preferred(size); + return _cufile_out != nullptr && _cufile_out->is_cufile_io_preferred(size); } void device_write(void const* gpu_data, size_t size, cudaStream_t stream) override { - _cufile_out.write(gpu_data, _bytes_written, size); + if (!supports_device_write()) CUDF_FAIL("Device writes are not supported for this file."); + + _cufile_out->write(gpu_data, _bytes_written, size); _bytes_written += size; } private: std::ofstream _output_stream; size_t _bytes_written = 0; - cufile_output _cufile_out; + std::unique_ptr _cufile_out; }; /** diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 62b305fde7e..abb625317c0 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -46,7 +46,7 @@ class memory_mapped_source : public datasource { public: explicit memory_mapped_source(const char *filepath, size_t offset, size_t size) - : _cufile_in(filepath) + : _cufile_in(make_cufile_input(filepath)) { auto const file = file_wrapper(filepath, O_RDONLY); _file_size = file.size(); @@ -81,23 +81,26 @@ class memory_mapped_source : public datasource { return read_size; } - bool supports_device_read() const override { return true; } + bool supports_device_read() const override { return _cufile_in != nullptr; } bool is_device_read_preferred(size_t size) const { - return _cufile_in.is_cufile_io_preferred(size); + return _cufile_in != nullptr && _cufile_in->is_cufile_io_preferred(size); } std::unique_ptr device_read(size_t offset, size_t size) override { + if (!supports_device_read()) CUDF_FAIL("Device reads are not supported for this file."); + auto const read_size = std::min(size, _map_size - (offset - _map_offset)); - return _cufile_in.read(offset, size); + return _cufile_in->read(offset, size); } size_t device_read(size_t offset, size_t size, uint8_t *dst) override { + if (!supports_device_read()) CUDF_FAIL("Device reads are not supported for this file."); auto const read_size = std::min(size, _map_size - (offset - _map_offset)); - return _cufile_in.read(offset, size, dst); + return _cufile_in->read(offset, size, dst); } size_t size() const override { return _file_size; } @@ -130,7 +133,7 @@ class memory_mapped_source : public datasource { void *_map_addr = nullptr; size_t _map_size = 0; size_t _map_offset = 0; - cufile_input _cufile_in; + std::unique_ptr _cufile_in; }; /** diff --git a/cpp/src/io/utilities/file_utils.cpp b/cpp/src/io/utilities/file_utils.cpp index 18fb9afd179..21cb09872b8 100644 --- a/cpp/src/io/utilities/file_utils.cpp +++ b/cpp/src/io/utilities/file_utils.cpp @@ -138,5 +138,23 @@ void cufile_output::write(void const *data, size_t offset, size_t size) "cuFile error writing to a file"); } +std::unique_ptr make_cufile_input(std::string const &filepath) +{ + try { + return std::make_unique(filepath); + } catch (...) { + return nullptr; + } +} + +std::unique_ptr make_cufile_output(std::string const &filepath) +{ + try { + return std::make_unique(filepath); + } catch (...) { + return nullptr; + } +} + }; // namespace io }; // namespace cudf diff --git a/cpp/src/io/utilities/file_utils.hpp b/cpp/src/io/utilities/file_utils.hpp index 85093b2edb6..87154950d19 100644 --- a/cpp/src/io/utilities/file_utils.hpp +++ b/cpp/src/io/utilities/file_utils.hpp @@ -126,5 +126,19 @@ class cufile_output final : public cufile_io_base { void write(void const *data, size_t offset, size_t size); }; +/** + * @brief Creates a `cufile_input` object + * + * Returns a null pointer if an exception occurs in the `cufile_input` constructor. + */ +std::unique_ptr make_cufile_input(std::string const &filepath); + +/** + * @brief Creates a `cufile_output` object + * + * Returns a null pointer if an exception occurs in the `cufile_output` constructor. + */ +std::unique_ptr make_cufile_output(std::string const &filepath); + }; // namespace io }; // namespace cudf From 5896b7e3ffa9e1ea7760bc9156c40bb42de5c383 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 23 Feb 2021 12:23:04 -0800 Subject: [PATCH 17/52] update to 0.19 stuff --- cpp/CMakeLists.txt | 4 ++-- cpp/src/io/utilities/data_sink.cpp | 2 +- cpp/src/io/utilities/datasource.cpp | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index bd20c6e034a..979e8b8313b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -220,11 +220,11 @@ endif(Boost_FOUND) find_path(CUFILE_INCLUDE "cufile.h" HINTS "${GDS_ROOT}/lib" - "/usr/local/gds/lib") + "/usr/local/cuda/lib64") find_library(CUFILE_LIBRARY "libcufile.so" HINTS "${GDS_ROOT}/lib" - "/usr/local/gds/lib") + "/usr/local/cuda/lib64") message(STATUS "CUFILE: CUFILE_LIBRARY set to ${CUFILE_LIBRARY}") message(STATUS "CUFILE: CUFILE_INCLUDE set to ${CUFILE_INCLUDE}") diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index c700a7c54b5..70ebd6993ea 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -55,7 +55,7 @@ class file_sink : public data_sink { return _cufile_out != nullptr && _cufile_out->is_cufile_io_preferred(size); } - void device_write(void const* gpu_data, size_t size, cudaStream_t stream) override + void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override { if (!supports_device_write()) CUDF_FAIL("Device writes are not supported for this file."); diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 77098f752c0..00ca653cbbe 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -93,14 +93,14 @@ class memory_mapped_source : public datasource { if (!supports_device_read()) CUDF_FAIL("Device reads are not supported for this file."); auto const read_size = std::min(size, _map_size - (offset - _map_offset)); - return _cufile_in->read(offset, size); + return _cufile_in->read(offset, read_size); } size_t device_read(size_t offset, size_t size, uint8_t *dst) override { if (!supports_device_read()) CUDF_FAIL("Device reads are not supported for this file."); auto const read_size = std::min(size, _map_size - (offset - _map_offset)); - return _cufile_in->read(offset, size, dst); + return _cufile_in->read(offset, read_size, dst); } size_t size() const override { return _file_size; } From e5071ab8f5203c13c6cf4d3d81d646554a24e6e7 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 23 Feb 2021 15:47:28 -0800 Subject: [PATCH 18/52] don't enforce compatibility mode through config file --- cpp/CMakeLists.txt | 8 +--- cpp/config/cufile.json | 70 ----------------------------- cpp/src/io/utilities/file_utils.cpp | 6 --- 3 files changed, 1 insertion(+), 83 deletions(-) delete mode 100644 cpp/config/cufile.json diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 979e8b8313b..45e3a3c31ff 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -231,9 +231,7 @@ message(STATUS "CUFILE: CUFILE_INCLUDE set to ${CUFILE_INCLUDE}") add_library(cufile SHARED IMPORTED ${CUFILE_LIBRARY}) if (CUFILE_INCLUDE AND CUFILE_LIBRARY) - set_target_properties(cufile PROPERTIES IMPORTED_LOCATION ${CUFILE_LIBRARY}) - configure_file("${CMAKE_CURRENT_SOURCE_DIR}/config/cufile.json" - "${CMAKE_BINARY_DIR}/config/cufile.json") + set_target_properties(cufile PROPERTIES IMPORTED_LOCATION ${CUFILE_LIBRARY}) else() message(FATAL_ERROR "cufile not found, please pass the GDS install directory using -DGDS_ROOT") endif (CUFILE_INCLUDE AND CUFILE_LIBRARY) @@ -641,10 +639,6 @@ install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/cudf DESTINATION include COMPONENT cudf) -install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/config/cufile.json - DESTINATION lib - COMPONENT cudf) - install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/cudf_test DESTINATION include COMPONENT cudf) diff --git a/cpp/config/cufile.json b/cpp/config/cufile.json deleted file mode 100644 index 7ad2d29bfa9..00000000000 --- a/cpp/config/cufile.json +++ /dev/null @@ -1,70 +0,0 @@ -{ - "logging": { - // log directory, if not enabled will create log file under current working directory - //"dir": "/home/", - - // ERROR|WARN|INFO|DEBUG|TRACE (in decreasing order of priority) - "level": "ERROR" - }, - - "profile": { - // nvtx profiling on/off - "nvtx": false, - // cufile stats level(0-3) - "cufile_stats": 0 - }, - - "properties": { - // max IO chunk size (4K aligned) used by cuFile per IO request (in KB) - "max_direct_io_size_kb" : 16384, - // device memory size (4K aligned) for reserving bounce buffers for the entire GPU (in KB) - "max_device_cache_size_kb" : 131072, - // limit on maximum memory (4K aligned) that can be pinned for a given process (in KB) - "max_device_pinned_mem_size_kb" : 33554432, - // true or false (true will enable asynchronous io submission to nvidia-fs driver) - "use_poll_mode" : false, - // maximum IO request size (4K aligned) within or equal to which library will poll (in KB) - "poll_mode_max_size_kb": 4, - // allow compat mode, this will enable use of cufile posix read/writes - "allow_compat_mode": true, - // client-side rdma addr list for user-space file-systems(e.g ["10.0.1.0", "10.0.2.0"]) - "rdma_dev_addr_list": [ - "172.16.8.240","172.16.8.241", - "172.16.8.242","172.16.8.243", - "172.16.8.245","172.16.8.246", - "172.16.8.247","172.16.8.248" - ] - }, - - "fs": { - "generic": { - - // for unaligned writes, setting it to true will use posix write instead of cuFileWrite - "posix_unaligned_writes" : false - }, - - "lustre": { - - // IO threshold for read/write (4K aligned)) equal to or below which cufile will use posix reads (KB) - "posix_gds_min_kb" : 0 - } - }, - - "blacklist": { - // specify list of vendor driver modules to blacklist for nvidia-fs (e.g. ["nvme" , "nvme_rdma"]) - "drivers": [ ], - - // specify list of block devices to prevent IO using libcufile (e.g. [ "/dev/sda1" ]) - "devices": [ ], - - // specify list of mount points to prevent IO using libcufile (e.g. ["/mnt/test"]) - "mounts": [ ], - - // specify list of file-systems to prevent IO using libcufile (e.g ["lustre", "wekafs"]) - "filesystems": [ ] - } - - // Application can override custom configuration via export CUFILE_ENV_PATH_JSON= - // e.g : export CUFILE_ENV_PATH_JSON="/home//cufile.json" -} - diff --git a/cpp/src/io/utilities/file_utils.cpp b/cpp/src/io/utilities/file_utils.cpp index 21cb09872b8..23cae1f9686 100644 --- a/cpp/src/io/utilities/file_utils.cpp +++ b/cpp/src/io/utilities/file_utils.cpp @@ -60,12 +60,6 @@ std::string get_libcudf_dir_path() struct cufile_driver { cufile_driver() { - // Unless CUFILE_ENV_PATH_JSON is already set, set the env var to point to a config file with - // enabled compatiblity mode - auto const cufile_config_path = get_libcudf_dir_path() + "config/cufile.json"; - CUDF_EXPECTS(setenv("CUFILE_ENV_PATH_JSON", cufile_config_path.c_str(), 0) == 0, - "Failed to set the cuFile config file environment variable."); - CUDF_EXPECTS(cuFileDriverOpen().err == CU_FILE_SUCCESS, "Failed to initialize cuFile driver"); } ~cufile_driver() { cuFileDriverClose(); } From 10a1eee6dd27e87360f9156ac44f5c7992811a1b Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 23 Feb 2021 18:06:03 -0800 Subject: [PATCH 19/52] add stream parameter to device_read --- cpp/include/cudf/io/datasource.hpp | 18 +++++++++++------- cpp/src/io/parquet/reader_impl.cu | 2 +- cpp/src/io/utilities/datasource.cpp | 26 ++++++++++++++++++-------- cpp/src/io/utilities/file_utils.cpp | 8 +++++--- cpp/src/io/utilities/file_utils.hpp | 12 ++++++++---- 5 files changed, 43 insertions(+), 23 deletions(-) diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index d20176e2abb..ace2010fc6d 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -19,6 +19,8 @@ #include #include +#include + #include #include #include @@ -158,12 +160,14 @@ class datasource { * Data source implementations that don't support direct device reads don't need to override this * function. * - * @param[in] offset Bytes from the start - * @param[in] size Bytes to read + * @param offset Bytes from the start + * @param size Bytes to read * * @return The data buffer in the device memory */ - virtual std::unique_ptr device_read(size_t offset, size_t size) + virtual std::unique_ptr device_read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) { CUDF_FAIL("datasource classes that support device_read must override this function."); } @@ -174,13 +178,13 @@ class datasource { * Data source implementations that don't support direct device reads don't need to override this * function. * - * @param[in] offset Bytes from the start - * @param[in] size Bytes to read - * @param[in] dst Address of the existing device memory + * @param offset Bytes from the start + * @param size Bytes to read + * @param dst Address of the existing device memory * * @return The number of bytes read (can be smaller than size) */ - virtual size_t device_read(size_t offset, size_t size, uint8_t* dst) + virtual size_t device_read(size_t offset, size_t size, uint8_t* dst, rmm::cuda_stream_view stream) { CUDF_FAIL("datasource classes that support device_read must override this function."); } diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index 8aff9ef700c..2f9b95bbb38 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -852,7 +852,7 @@ void reader::impl::read_column_chunks( if (io_size != 0) { auto &source = _sources[chunk_source_map[chunk]]; if (source->is_device_read_preferred(io_size)) { - page_data[chunk] = source->device_read(io_offset, io_size); + page_data[chunk] = source->device_read(io_offset, io_size, stream); } else { auto const buffer = source->host_read(io_offset, io_size); page_data[chunk] = diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 00ca653cbbe..1786b4e852c 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -88,19 +88,24 @@ class memory_mapped_source : public datasource { return _cufile_in != nullptr && _cufile_in->is_cufile_io_preferred(size); } - std::unique_ptr device_read(size_t offset, size_t size) override + std::unique_ptr device_read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) override { if (!supports_device_read()) CUDF_FAIL("Device reads are not supported for this file."); auto const read_size = std::min(size, _map_size - (offset - _map_offset)); - return _cufile_in->read(offset, read_size); + return _cufile_in->read(offset, read_size, stream); } - size_t device_read(size_t offset, size_t size, uint8_t *dst) override + size_t device_read(size_t offset, + size_t size, + uint8_t *dst, + rmm::cuda_stream_view stream) override { if (!supports_device_read()) CUDF_FAIL("Device reads are not supported for this file."); auto const read_size = std::min(size, _map_size - (offset - _map_offset)); - return _cufile_in->read(offset, read_size, dst); + return _cufile_in->read(offset, read_size, dst, stream); } size_t size() const override { return _file_size; } @@ -159,14 +164,19 @@ class user_datasource_wrapper : public datasource { bool supports_device_read() const override { return source->supports_device_read(); } - size_t device_read(size_t offset, size_t size, uint8_t *dst) override + size_t device_read(size_t offset, + size_t size, + uint8_t *dst, + rmm::cuda_stream_view stream) override { - return source->device_read(offset, size, dst); + return source->device_read(offset, size, dst, stream); } - std::unique_ptr device_read(size_t offset, size_t size) override + std::unique_ptr device_read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) override { - return source->device_read(offset, size); + return source->device_read(offset, size, stream); } size_t size() const override { return source->size(); } diff --git a/cpp/src/io/utilities/file_utils.cpp b/cpp/src/io/utilities/file_utils.cpp index 23cae1f9686..d1511c6d890 100644 --- a/cpp/src/io/utilities/file_utils.cpp +++ b/cpp/src/io/utilities/file_utils.cpp @@ -103,16 +103,18 @@ cufile_input::cufile_input(std::string const &filepath) init_cufile_driver(); } -std::unique_ptr cufile_input::read(size_t offset, size_t size) +std::unique_ptr cufile_input::read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) { - rmm::device_buffer out_data(size); + rmm::device_buffer out_data(size, stream); CUDF_EXPECTS(cuFileRead(cf_file.handle, out_data.data(), size, offset, 0) != -1, "cuFile error reading from a file"); return datasource::buffer::create(std::move(out_data)); } -size_t cufile_input::read(size_t offset, size_t size, uint8_t *dst) +size_t cufile_input::read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream) { CUDF_EXPECTS(cuFileRead(cf_file.handle, dst, size, offset, 0) != -1, "cuFile error reading from a file"); diff --git a/cpp/src/io/utilities/file_utils.hpp b/cpp/src/io/utilities/file_utils.hpp index 87154950d19..08c6bcfbbdf 100644 --- a/cpp/src/io/utilities/file_utils.hpp +++ b/cpp/src/io/utilities/file_utils.hpp @@ -16,13 +16,15 @@ #pragma once -#include - #include +#include + #include #include +#include + namespace cudf { namespace io { @@ -101,14 +103,16 @@ class cufile_input final : public cufile_io_base { /** * @brief Reads into a new device buffer. */ - std::unique_ptr read(size_t offset, size_t size); + std::unique_ptr read(size_t offset, + size_t size, + rmm::cuda_stream_view stream); /** * @brief Reads into existing device memory. * * Returns the number of bytes read. */ - size_t read(size_t offset, size_t size, uint8_t *dst); + size_t read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream); }; /** From 30149239b81d13b6a0a629bfc0ffa5c032b58404 Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 24 Feb 2021 14:01:07 -0800 Subject: [PATCH 20/52] compile-time disable cufile code if the library is not installed --- cpp/CMakeLists.txt | 1 + cpp/src/io/utilities/data_sink.cpp | 2 +- cpp/src/io/utilities/datasource.cpp | 2 +- cpp/src/io/utilities/file_utils.cpp | 44 +++++---- cpp/src/io/utilities/file_utils.hpp | 146 +++++++++++++++++++--------- 5 files changed, 131 insertions(+), 64 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 45e3a3c31ff..f82b72aa65a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -231,6 +231,7 @@ message(STATUS "CUFILE: CUFILE_INCLUDE set to ${CUFILE_INCLUDE}") add_library(cufile SHARED IMPORTED ${CUFILE_LIBRARY}) if (CUFILE_INCLUDE AND CUFILE_LIBRARY) + add_compile_definitions(CUFILE_INSTALLED) set_target_properties(cufile PROPERTIES IMPORTED_LOCATION ${CUFILE_LIBRARY}) else() message(FATAL_ERROR "cufile not found, please pass the GDS install directory using -DGDS_ROOT") diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 70ebd6993ea..d291735083f 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -66,7 +66,7 @@ class file_sink : public data_sink { private: std::ofstream _output_stream; size_t _bytes_written = 0; - std::unique_ptr _cufile_out; + std::unique_ptr _cufile_out; }; /** diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 1786b4e852c..8ca4cfdbe04 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -138,7 +138,7 @@ class memory_mapped_source : public datasource { void *_map_addr = nullptr; size_t _map_size = 0; size_t _map_offset = 0; - std::unique_ptr _cufile_in; + std::unique_ptr _cufile_in; }; /** diff --git a/cpp/src/io/utilities/file_utils.cpp b/cpp/src/io/utilities/file_utils.cpp index d1511c6d890..c23a0be5ff2 100644 --- a/cpp/src/io/utilities/file_utils.cpp +++ b/cpp/src/io/utilities/file_utils.cpp @@ -83,13 +83,13 @@ long file_wrapper::size() const } return _size; } - -cufile_registered_file::cufile_registered_file(int fd) +#ifdef CUFILE_INSTALLED +void cufile_registered_file::register_handle() { init_cufile_driver(); CUfileDescr_t cufile_desc{}; - cufile_desc.handle.fd = fd; + cufile_desc.handle.fd = file.desc(); cufile_desc.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD; CUDF_EXPECTS(cuFileHandleRegister(&handle, &cufile_desc).err == CU_FILE_SUCCESS, "Cannot register file handle with cuFile"); @@ -97,15 +97,15 @@ cufile_registered_file::cufile_registered_file(int fd) cufile_registered_file::~cufile_registered_file() { cuFileHandleDeregister(handle); } -cufile_input::cufile_input(std::string const &filepath) - : cufile_io_base(filepath, O_RDONLY | O_DIRECT) +cufile_input_impl::cufile_input_impl(std::string const &filepath) + : cf_file(filepath, O_RDONLY | O_DIRECT) { init_cufile_driver(); } -std::unique_ptr cufile_input::read(size_t offset, - size_t size, - rmm::cuda_stream_view stream) +std::unique_ptr cufile_input_impl::read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) { rmm::device_buffer out_data(size, stream); CUDF_EXPECTS(cuFileRead(cf_file.handle, out_data.data(), size, offset, 0) != -1, @@ -114,7 +114,10 @@ std::unique_ptr cufile_input::read(size_t offset, return datasource::buffer::create(std::move(out_data)); } -size_t cufile_input::read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream) +size_t cufile_input_impl::read(size_t offset, + size_t size, + uint8_t *dst, + rmm::cuda_stream_view stream) { CUDF_EXPECTS(cuFileRead(cf_file.handle, dst, size, offset, 0) != -1, "cuFile error reading from a file"); @@ -122,34 +125,39 @@ size_t cufile_input::read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_st return size; } -cufile_output::cufile_output(std::string const &filepath) - : cufile_io_base(filepath, O_CREAT | O_RDWR | O_DIRECT, 0664) +cufile_output_impl::cufile_output_impl(std::string const &filepath) + : cf_file(filepath, O_CREAT | O_RDWR | O_DIRECT, 0664) { init_cufile_driver(); } -void cufile_output::write(void const *data, size_t offset, size_t size) +void cufile_output_impl::write(void const *data, size_t offset, size_t size) { CUDF_EXPECTS(cuFileWrite(cf_file.handle, data, size, offset, 0) != -1, "cuFile error writing to a file"); } +#endif -std::unique_ptr make_cufile_input(std::string const &filepath) +std::unique_ptr make_cufile_input(std::string const &filepath) { +#ifdef CUFILE_INSTALLED try { - return std::make_unique(filepath); + return std::make_unique(filepath); } catch (...) { - return nullptr; } +#endif + return nullptr; } -std::unique_ptr make_cufile_output(std::string const &filepath) +std::unique_ptr make_cufile_output(std::string const &filepath) { +#ifdef CUFILE_INSTALLED try { - return std::make_unique(filepath); + return std::make_unique(filepath); } catch (...) { - return nullptr; } +#endif + return nullptr; } }; // namespace io diff --git a/cpp/src/io/utilities/file_utils.hpp b/cpp/src/io/utilities/file_utils.hpp index 08c6bcfbbdf..9c51b1b494b 100644 --- a/cpp/src/io/utilities/file_utils.hpp +++ b/cpp/src/io/utilities/file_utils.hpp @@ -16,7 +16,9 @@ #pragma once +#ifdef CUFILE_INSTALLED #include +#endif #include @@ -43,15 +45,6 @@ class file_wrapper { auto desc() const { return fd; } }; -/** - * @brief Class that provides RAII for cuFile file registration. - */ -struct cufile_registered_file { - CUfileHandle_t handle = nullptr; - explicit cufile_registered_file(int fd); - ~cufile_registered_file(); -}; - /** * @brief Base class for cuFile input/output. * @@ -59,17 +52,6 @@ struct cufile_registered_file { */ class cufile_io_base { public: - cufile_io_base(std::string const &filepath, int flags) - : file(filepath, flags), cf_file{file.desc()} - { - } - cufile_io_base(std::string const &filepath, int flags, mode_t mode) - : file(filepath, flags, mode), cf_file{file.desc()} - { - } - - virtual ~cufile_io_base() = default; - /** * @brief Returns an estimate of whether the cuFile operation is the optimal option. * @@ -87,32 +69,81 @@ class cufile_io_base { * different logic based on the system config. */ static constexpr size_t op_size_threshold = 128 << 10; - file_wrapper const file; - cufile_registered_file const cf_file; }; /** - * @brief Adapter for the `cuFileRead` API. - * - * Exposes APIs to read directly from a file into device memory. + * @brief Interface class for cufile input. */ -class cufile_input final : public cufile_io_base { +class cufile_input : public cufile_io_base { public: - cufile_input(std::string const &filepath); - /** * @brief Reads into a new device buffer. */ - std::unique_ptr read(size_t offset, - size_t size, - rmm::cuda_stream_view stream); + virtual std::unique_ptr read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) = 0; /** * @brief Reads into existing device memory. * * Returns the number of bytes read. */ - size_t read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream); + virtual size_t read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream) = 0; +}; + +/** + * @brief Interface class for cufile output. + */ +class cufile_output : public cufile_io_base { + public: + /** + * @brief Writes the data from a device buffer into a file. + */ + virtual void write(void const *data, size_t offset, size_t size) = 0; +}; + +#ifdef CUFILE_INSTALLED +/** + * @brief Class that provides RAII for cuFile file registration. + */ +struct cufile_registered_file { + private: + void register_handle(); + + public: + file_wrapper const file; + CUfileHandle_t handle = nullptr; + cufile_registered_file(std::string const &filepath, int flags) : file(filepath, flags) + { + register_handle(); + } + + cufile_registered_file(std::string const &filepath, int flags, mode_t mode) + : file(filepath, flags, mode) + { + register_handle(); + } + + ~cufile_registered_file(); +}; + +/** + * @brief Adapter for the `cuFileRead` API. + * + * Exposes APIs to read directly from a file into device memory. + */ +class cufile_input_impl final : public cufile_input { + public: + cufile_input_impl(std::string const &filepath); + + std::unique_ptr read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) override; + + size_t read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream) override; + + private: + cufile_registered_file const cf_file; }; /** @@ -120,29 +151,56 @@ class cufile_input final : public cufile_io_base { * * Exposes an API to write directly into a file from device memory. */ -class cufile_output final : public cufile_io_base { +class cufile_output_impl final : public cufile_output { public: - cufile_output(std::string const &filepath); + cufile_output_impl(std::string const &filepath); - /** - * @brief Writes the data from a device buffer into a file. - */ - void write(void const *data, size_t offset, size_t size); + void write(void const *data, size_t offset, size_t size) override; + + private: + cufile_registered_file const cf_file; +}; +#else + +class cufile_input_impl final : public cufile_input { + public: + std::unique_ptr read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) override + { + CUDF_FAIL("Only used to compile without cufile library, should not be called"); + } + + size_t read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream) override + { + CUDF_FAIL("Only used to compile without cufile library, should not be called"); + } +}; + +class cufile_output_impl final : public cufile_output { + public: + void write(void const *data, size_t offset, size_t size) override + { + CUDF_FAIL("Only used to compile without cufile library, should not be called"); + } }; +#endif /** - * @brief Creates a `cufile_input` object + * @brief Creates a `cufile_input_impl` object * - * Returns a null pointer if an exception occurs in the `cufile_input` constructor. + * Returns a null pointer if an exception occurs in the `cufile_input_impl` constructor, or if the + * cuFile library is not installed. */ -std::unique_ptr make_cufile_input(std::string const &filepath); +std::unique_ptr make_cufile_input(std::string const &filepath); /** - * @brief Creates a `cufile_output` object + * @brief Creates a `cufile_output_impl` object * - * Returns a null pointer if an exception occurs in the `cufile_output` constructor. + * Returns a null pointer if an exception occurs in the `cufile_output_impl` constructor, or if the + * cuFile library is not installed. */ -std::unique_ptr make_cufile_output(std::string const &filepath); +std::unique_ptr make_cufile_output(std::string const &filepath); }; // namespace io }; // namespace cudf From f388265b4cf4468db30d595788ed21efe0c1e5a9 Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 24 Feb 2021 14:11:18 -0800 Subject: [PATCH 21/52] style fix --- cpp/src/io/parquet/writer_impl.cu | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index b134391ba1f..7b9f9fb12d2 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1118,11 +1118,11 @@ void writer::impl::write(table_view const &table) } auto host_bfr = pinned_buffer{[](size_t size) { - uint8_t *ptr = nullptr; - CUDA_TRY(cudaMallocHost(&ptr, size)); - return ptr; - }(max_chunk_bfr_size), - cudaFreeHost}; + uint8_t *ptr = nullptr; + CUDA_TRY(cudaMallocHost(&ptr, size)); + return ptr; + }(max_chunk_bfr_size), + cudaFreeHost}; // Encode row groups in batches for (uint32_t b = 0, r = 0, global_r = global_rowgroup_base; b < (uint32_t)batch_list.size(); From f4337571e8b18eabb6da66bce995ed1641646d29 Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 24 Feb 2021 15:13:11 -0800 Subject: [PATCH 22/52] fix building without cufile --- cpp/CMakeLists.txt | 14 +++++++++----- cpp/src/io/utilities/file_utils.cpp | 27 ++++++++++++++------------- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f82b72aa65a..c22f3cc3d24 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -233,8 +233,6 @@ add_library(cufile SHARED IMPORTED ${CUFILE_LIBRARY}) if (CUFILE_INCLUDE AND CUFILE_LIBRARY) add_compile_definitions(CUFILE_INSTALLED) set_target_properties(cufile PROPERTIES IMPORTED_LOCATION ${CUFILE_LIBRARY}) -else() - message(FATAL_ERROR "cufile not found, please pass the GDS install directory using -DGDS_ROOT") endif (CUFILE_INCLUDE AND CUFILE_LIBRARY) ################################################################################################### @@ -328,8 +326,10 @@ include_directories("${CMAKE_BINARY_DIR}/include" "${ZLIB_INCLUDE_DIRS}" "${Boost_INCLUDE_DIRS}" "${RMM_INCLUDE}" - "${DLPACK_INCLUDE}" - "${CUFILE_INCLUDE}") + "${DLPACK_INCLUDE}") +if(CUFILE_INSTALLED) + include_directories("${CUFILE_INCLUDE}") +endif() if(CONDA_INCLUDE_DIRS) include_directories("${CONDA_INCLUDE_DIRS}") @@ -543,7 +543,11 @@ function(add_library_module NAMESPACE MODULE MODULE_SOURCE_DIRS) # spdlog level target_compile_definitions("${NAMESPACE}_${MODULE}" PUBLIC "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${LOGGING_LEVEL}") add_dependencies("${NAMESPACE}_${MODULE}" stringify_run) - target_link_libraries("${NAMESPACE}_${MODULE}" arrow arrow_cuda nvrtc ${CUDART_LIBRARY} cuda ${ZLIB_LIBRARIES} ${Boost_LIBRARIES} cufile) + target_link_libraries("${NAMESPACE}_${MODULE}" arrow arrow_cuda nvrtc ${CUDART_LIBRARY} cuda ${ZLIB_LIBRARIES} ${Boost_LIBRARIES}) + if(CUFILE_INSTALLED) + target_link_libraries(cufile) + endif() + add_library("${NAMESPACE}::${MODULE}" ALIAS "${NAMESPACE}_${MODULE}") endfunction() diff --git a/cpp/src/io/utilities/file_utils.cpp b/cpp/src/io/utilities/file_utils.cpp index c23a0be5ff2..162d8dcd02d 100644 --- a/cpp/src/io/utilities/file_utils.cpp +++ b/cpp/src/io/utilities/file_utils.cpp @@ -51,6 +51,19 @@ std::string get_libcudf_dir_path() return dir_path; } +file_wrapper::~file_wrapper() { close(fd); } + +long file_wrapper::size() const +{ + if (_size < 0) { + struct stat st; + CUDF_EXPECTS(fstat(fd, &st) != -1, "Cannot query file size"); + _size = static_cast(st.st_size); + } + return _size; +} + +#ifdef CUFILE_INSTALLED /** * @brief Class that provides RAII for cuFile driver management. * @@ -68,22 +81,10 @@ struct cufile_driver { /** * @brief Initializes the cuFile driver. * - * Needs to be called before any cuFile operation. + * Should be called before any cuFile operation (no overhead after the first call). */ void init_cufile_driver() { static cufile_driver driver; } -file_wrapper::~file_wrapper() { close(fd); } - -long file_wrapper::size() const -{ - if (_size < 0) { - struct stat st; - CUDF_EXPECTS(fstat(fd, &st) != -1, "Cannot query file size"); - _size = static_cast(st.st_size); - } - return _size; -} -#ifdef CUFILE_INSTALLED void cufile_registered_file::register_handle() { init_cufile_driver(); From fb6ea8b9b6cf299bd854b59442b024c157696b97 Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 24 Feb 2021 17:07:51 -0800 Subject: [PATCH 23/52] CMake fix --- cpp/CMakeLists.txt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c22f3cc3d24..f576f039bb8 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -231,6 +231,7 @@ message(STATUS "CUFILE: CUFILE_INCLUDE set to ${CUFILE_INCLUDE}") add_library(cufile SHARED IMPORTED ${CUFILE_LIBRARY}) if (CUFILE_INCLUDE AND CUFILE_LIBRARY) + set(CUFILE_INSTALLED TRUE) add_compile_definitions(CUFILE_INSTALLED) set_target_properties(cufile PROPERTIES IMPORTED_LOCATION ${CUFILE_LIBRARY}) endif (CUFILE_INCLUDE AND CUFILE_LIBRARY) @@ -329,7 +330,7 @@ include_directories("${CMAKE_BINARY_DIR}/include" "${DLPACK_INCLUDE}") if(CUFILE_INSTALLED) include_directories("${CUFILE_INCLUDE}") -endif() +endif(CUFILE_INSTALLED) if(CONDA_INCLUDE_DIRS) include_directories("${CONDA_INCLUDE_DIRS}") @@ -545,8 +546,8 @@ function(add_library_module NAMESPACE MODULE MODULE_SOURCE_DIRS) add_dependencies("${NAMESPACE}_${MODULE}" stringify_run) target_link_libraries("${NAMESPACE}_${MODULE}" arrow arrow_cuda nvrtc ${CUDART_LIBRARY} cuda ${ZLIB_LIBRARIES} ${Boost_LIBRARIES}) if(CUFILE_INSTALLED) - target_link_libraries(cufile) - endif() + target_link_libraries("${NAMESPACE}_${MODULE}" cufile) + endif(CUFILE_INSTALLED) add_library("${NAMESPACE}::${MODULE}" ALIAS "${NAMESPACE}_${MODULE}") endfunction() From ec52e85d69abb569b1401672a2b1bd6ad9a6e456 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 25 Feb 2021 10:00:01 -0800 Subject: [PATCH 24/52] cmake clean up --- cpp/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f576f039bb8..a2db74daacd 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -216,7 +216,7 @@ else() endif(Boost_FOUND) ################################################################################################### -# - find cufile ----------------------------------------------------------------------------------- +# - cuFile ----------------------------------------------------------------------------------- find_path(CUFILE_INCLUDE "cufile.h" HINTS "${GDS_ROOT}/lib" @@ -229,11 +229,11 @@ find_library(CUFILE_LIBRARY "libcufile.so" message(STATUS "CUFILE: CUFILE_LIBRARY set to ${CUFILE_LIBRARY}") message(STATUS "CUFILE: CUFILE_INCLUDE set to ${CUFILE_INCLUDE}") -add_library(cufile SHARED IMPORTED ${CUFILE_LIBRARY}) if (CUFILE_INCLUDE AND CUFILE_LIBRARY) + add_library(cufile SHARED IMPORTED ${CUFILE_LIBRARY}) + set_target_properties(cufile PROPERTIES IMPORTED_LOCATION ${CUFILE_LIBRARY}) set(CUFILE_INSTALLED TRUE) add_compile_definitions(CUFILE_INSTALLED) - set_target_properties(cufile PROPERTIES IMPORTED_LOCATION ${CUFILE_LIBRARY}) endif (CUFILE_INCLUDE AND CUFILE_LIBRARY) ################################################################################################### From d181fec041cca99dcdf6a41e6b72d9d2419ba8fd Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 25 Feb 2021 13:27:56 -0800 Subject: [PATCH 25/52] add missing null check --- cpp/src/io/utilities/data_sink.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index d291735083f..024167b87d9 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -48,7 +48,7 @@ class file_sink : public data_sink { size_t bytes_written() override { return _bytes_written; } - bool supports_device_write() const override { return true; } + bool supports_device_write() const override { return _cufile_out != nullptr; } bool is_device_write_preferred(size_t size) const override { From 1b6070ee57cc75b7bfeae9bc70aeb6aeaaf70267 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 25 Feb 2021 17:03:06 -0800 Subject: [PATCH 26/52] avoid repeated failed attempts to initialize cuFile --- cpp/src/io/utilities/file_utils.cpp | 53 ++++++++++++++++++----------- cpp/src/io/utilities/file_utils.hpp | 24 +++++++++---- 2 files changed, 51 insertions(+), 26 deletions(-) diff --git a/cpp/src/io/utilities/file_utils.cpp b/cpp/src/io/utilities/file_utils.cpp index 162d8dcd02d..57a84093964 100644 --- a/cpp/src/io/utilities/file_utils.cpp +++ b/cpp/src/io/utilities/file_utils.cpp @@ -70,38 +70,51 @@ long file_wrapper::size() const * Should be used as a singleton. Sets the environment path to point to cudf cuFile config file * (enables compatilibity mode). */ -struct cufile_driver { +class cufile_driver { + private: cufile_driver() { + // dlopen CUDF_EXPECTS(cuFileDriverOpen().err == CU_FILE_SUCCESS, "Failed to initialize cuFile driver"); + // dlsym for each used API + } + + public: + static auto const *get_instance() + { + static bool first_call = true; + static std::unique_ptr instance; + if (first_call) { + try { + instance = std::unique_ptr(new cufile_driver()); + } catch (...) { + first_call = false; + throw; + } + first_call = false; + } else if (!instance) { + CUDF_FAIL("Failed to initialize cuFile driver"); + } + return instance.get(); } ~cufile_driver() { cuFileDriverClose(); } + // forwards cufile APIs }; -/** - * @brief Initializes the cuFile driver. - * - * Should be called before any cuFile operation (no overhead after the first call). - */ -void init_cufile_driver() { static cufile_driver driver; } - void cufile_registered_file::register_handle() { - init_cufile_driver(); - CUfileDescr_t cufile_desc{}; - cufile_desc.handle.fd = file.desc(); + cufile_desc.handle.fd = _file.desc(); cufile_desc.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD; - CUDF_EXPECTS(cuFileHandleRegister(&handle, &cufile_desc).err == CU_FILE_SUCCESS, + CUDF_EXPECTS(cuFileHandleRegister(&_handle, &cufile_desc).err == CU_FILE_SUCCESS, "Cannot register file handle with cuFile"); } -cufile_registered_file::~cufile_registered_file() { cuFileHandleDeregister(handle); } +cufile_registered_file::~cufile_registered_file() { cuFileHandleDeregister(_handle); } cufile_input_impl::cufile_input_impl(std::string const &filepath) - : cf_file(filepath, O_RDONLY | O_DIRECT) + : driver{cufile_driver::get_instance()}, cf_file(driver, filepath, O_RDONLY | O_DIRECT) { - init_cufile_driver(); } std::unique_ptr cufile_input_impl::read(size_t offset, @@ -109,7 +122,7 @@ std::unique_ptr cufile_input_impl::read(size_t offset, rmm::cuda_stream_view stream) { rmm::device_buffer out_data(size, stream); - CUDF_EXPECTS(cuFileRead(cf_file.handle, out_data.data(), size, offset, 0) != -1, + CUDF_EXPECTS(cuFileRead(cf_file.handle(), out_data.data(), size, offset, 0) != -1, "cuFile error reading from a file"); return datasource::buffer::create(std::move(out_data)); @@ -120,21 +133,21 @@ size_t cufile_input_impl::read(size_t offset, uint8_t *dst, rmm::cuda_stream_view stream) { - CUDF_EXPECTS(cuFileRead(cf_file.handle, dst, size, offset, 0) != -1, + CUDF_EXPECTS(cuFileRead(cf_file.handle(), dst, size, offset, 0) != -1, "cuFile error reading from a file"); // have to read the requested size for now return size; } cufile_output_impl::cufile_output_impl(std::string const &filepath) - : cf_file(filepath, O_CREAT | O_RDWR | O_DIRECT, 0664) + : driver{cufile_driver::get_instance()}, + cf_file(driver, filepath, O_CREAT | O_RDWR | O_DIRECT, 0664) { - init_cufile_driver(); } void cufile_output_impl::write(void const *data, size_t offset, size_t size) { - CUDF_EXPECTS(cuFileWrite(cf_file.handle, data, size, offset, 0) != -1, + CUDF_EXPECTS(cuFileWrite(cf_file.handle(), data, size, offset, 0) != -1, "cuFile error writing to a file"); } #endif diff --git a/cpp/src/io/utilities/file_utils.hpp b/cpp/src/io/utilities/file_utils.hpp index 9c51b1b494b..d898bf71b02 100644 --- a/cpp/src/io/utilities/file_utils.hpp +++ b/cpp/src/io/utilities/file_utils.hpp @@ -103,28 +103,38 @@ class cufile_output : public cufile_io_base { }; #ifdef CUFILE_INSTALLED + +class cufile_driver; /** * @brief Class that provides RAII for cuFile file registration. */ struct cufile_registered_file { - private: void register_handle(); public: - file_wrapper const file; - CUfileHandle_t handle = nullptr; - cufile_registered_file(std::string const &filepath, int flags) : file(filepath, flags) + cufile_registered_file(cufile_driver const *driver, std::string const &filepath, int flags) + : _file(filepath, flags), _driver{driver} { register_handle(); } - cufile_registered_file(std::string const &filepath, int flags, mode_t mode) - : file(filepath, flags, mode) + cufile_registered_file(cufile_driver const *driver, + std::string const &filepath, + int flags, + mode_t mode) + : _file(filepath, flags, mode), _driver{driver} { register_handle(); } + auto handle() const noexcept { return _handle; } + ~cufile_registered_file(); + + private: + file_wrapper const _file; + CUfileHandle_t _handle = nullptr; + cufile_driver const *_driver = nullptr; }; /** @@ -143,6 +153,7 @@ class cufile_input_impl final : public cufile_input { size_t read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream) override; private: + cufile_driver const *driver; cufile_registered_file const cf_file; }; @@ -158,6 +169,7 @@ class cufile_output_impl final : public cufile_output { void write(void const *data, size_t offset, size_t size) override; private: + cufile_driver const *driver; cufile_registered_file const cf_file; }; #else From 349973c68115dab6b05001422e32a985294850f9 Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 26 Feb 2021 12:56:55 -0800 Subject: [PATCH 27/52] link to libcufile at runtime --- cpp/CMakeLists.txt | 22 ++---- cpp/src/io/utilities/file_utils.cpp | 107 ++++++++++++++++++---------- cpp/src/io/utilities/file_utils.hpp | 24 +++---- 3 files changed, 85 insertions(+), 68 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a2db74daacd..5ee414b5c7b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -222,20 +222,11 @@ find_path(CUFILE_INCLUDE "cufile.h" HINTS "${GDS_ROOT}/lib" "/usr/local/cuda/lib64") -find_library(CUFILE_LIBRARY "libcufile.so" - HINTS "${GDS_ROOT}/lib" - "/usr/local/cuda/lib64") - -message(STATUS "CUFILE: CUFILE_LIBRARY set to ${CUFILE_LIBRARY}") message(STATUS "CUFILE: CUFILE_INCLUDE set to ${CUFILE_INCLUDE}") -if (CUFILE_INCLUDE AND CUFILE_LIBRARY) - add_library(cufile SHARED IMPORTED ${CUFILE_LIBRARY}) - set_target_properties(cufile PROPERTIES IMPORTED_LOCATION ${CUFILE_LIBRARY}) - set(CUFILE_INSTALLED TRUE) - add_compile_definitions(CUFILE_INSTALLED) -endif (CUFILE_INCLUDE AND CUFILE_LIBRARY) - +if (CUFILE_INCLUDE) + add_compile_definitions(CUFILE_INCLUDE) +endif (CUFILE_INCLUDE) ################################################################################################### # - RMM ------------------------------------------------------------------------------------------- @@ -328,9 +319,9 @@ include_directories("${CMAKE_BINARY_DIR}/include" "${Boost_INCLUDE_DIRS}" "${RMM_INCLUDE}" "${DLPACK_INCLUDE}") -if(CUFILE_INSTALLED) +if(CUFILE_INCLUDE) include_directories("${CUFILE_INCLUDE}") -endif(CUFILE_INSTALLED) +endif(CUFILE_INCLUDE) if(CONDA_INCLUDE_DIRS) include_directories("${CONDA_INCLUDE_DIRS}") @@ -545,9 +536,6 @@ function(add_library_module NAMESPACE MODULE MODULE_SOURCE_DIRS) target_compile_definitions("${NAMESPACE}_${MODULE}" PUBLIC "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${LOGGING_LEVEL}") add_dependencies("${NAMESPACE}_${MODULE}" stringify_run) target_link_libraries("${NAMESPACE}_${MODULE}" arrow arrow_cuda nvrtc ${CUDART_LIBRARY} cuda ${ZLIB_LIBRARIES} ${Boost_LIBRARIES}) - if(CUFILE_INSTALLED) - target_link_libraries("${NAMESPACE}_${MODULE}" cufile) - endif(CUFILE_INSTALLED) add_library("${NAMESPACE}::${MODULE}" ALIAS "${NAMESPACE}_${MODULE}") endfunction() diff --git a/cpp/src/io/utilities/file_utils.cpp b/cpp/src/io/utilities/file_utils.cpp index 57a84093964..feeaca2d6b4 100644 --- a/cpp/src/io/utilities/file_utils.cpp +++ b/cpp/src/io/utilities/file_utils.cpp @@ -63,57 +63,87 @@ long file_wrapper::size() const return _size; } -#ifdef CUFILE_INSTALLED +#ifdef CUFILE_INCLUDE /** - * @brief Class that provides RAII for cuFile driver management. - * - * Should be used as a singleton. Sets the environment path to point to cudf cuFile config file - * (enables compatilibity mode). + * @brief Class that dynamically loads the cuFile library and manages the cuFile driver. */ -class cufile_driver { +class cufile_shim { private: - cufile_driver() - { - // dlopen - CUDF_EXPECTS(cuFileDriverOpen().err == CU_FILE_SUCCESS, "Failed to initialize cuFile driver"); - // dlsym for each used API - } + cufile_shim(); public: - static auto const *get_instance() - { - static bool first_call = true; - static std::unique_ptr instance; - if (first_call) { - try { - instance = std::unique_ptr(new cufile_driver()); - } catch (...) { - first_call = false; - throw; - } + cufile_shim(cufile_shim const &) = delete; + cufile_shim &operator=(cufile_shim const &) = delete; + + static auto const *get_instance(); + + void *cf_lib = nullptr; + decltype(cuFileDriverOpen) *driver_open = nullptr; + decltype(cuFileDriverClose) *driver_close = nullptr; + decltype(cuFileHandleRegister) *handle_register = nullptr; + decltype(cuFileHandleDeregister) *handle_deregister = nullptr; + decltype(cuFileRead) *read = nullptr; + decltype(cuFileWrite) *write = nullptr; + + ~cufile_shim() + { // try-catch? + driver_close(); + dlclose(cf_lib); + } +}; + +cufile_shim::cufile_shim() +{ + cf_lib = dlopen("libcufile.so", RTLD_NOW); + driver_open = reinterpret_cast(dlsym(cf_lib, "cuFileDriverOpen")); + CUDF_EXPECTS(driver_open != nullptr, "could not find cuFile symbol"); + driver_close = reinterpret_cast(dlsym(cf_lib, "cuFileDriverClose")); + CUDF_EXPECTS(driver_close != nullptr, "could not find cuFile symbol"); + handle_register = + reinterpret_cast(dlsym(cf_lib, "cuFileHandleRegister")); + CUDF_EXPECTS(handle_register != nullptr, "could not find cuFile symbol"); + handle_deregister = + reinterpret_cast(dlsym(cf_lib, "cuFileHandleDeregister")); + CUDF_EXPECTS(handle_deregister != nullptr, "could not find cuFile symbol"); + read = reinterpret_cast(dlsym(cf_lib, "cuFileRead")); + CUDF_EXPECTS(read != nullptr, "could not find cuFile symbol"); + write = reinterpret_cast(dlsym(cf_lib, "cuFileWrite")); + CUDF_EXPECTS(write != nullptr, "could not find cuFile symbol"); + + CUDF_EXPECTS(driver_open().err == CU_FILE_SUCCESS, "Failed to initialize cuFile driver"); +} + +auto const *cufile_shim::get_instance() +{ + static bool first_call = true; + static std::unique_ptr instance; + if (first_call) { + try { + instance = std::unique_ptr(new cufile_shim()); + } catch (...) { first_call = false; - } else if (!instance) { - CUDF_FAIL("Failed to initialize cuFile driver"); + throw; } - return instance.get(); + first_call = false; + } else if (!instance) { + CUDF_FAIL("Failed to initialize cuFile driver"); } - ~cufile_driver() { cuFileDriverClose(); } - // forwards cufile APIs -}; + return instance.get(); +} void cufile_registered_file::register_handle() { CUfileDescr_t cufile_desc{}; cufile_desc.handle.fd = _file.desc(); cufile_desc.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD; - CUDF_EXPECTS(cuFileHandleRegister(&_handle, &cufile_desc).err == CU_FILE_SUCCESS, + CUDF_EXPECTS(shim->handle_register(&cf_handle, &cufile_desc).err == CU_FILE_SUCCESS, "Cannot register file handle with cuFile"); } -cufile_registered_file::~cufile_registered_file() { cuFileHandleDeregister(_handle); } +cufile_registered_file::~cufile_registered_file() { shim->handle_deregister(cf_handle); } cufile_input_impl::cufile_input_impl(std::string const &filepath) - : driver{cufile_driver::get_instance()}, cf_file(driver, filepath, O_RDONLY | O_DIRECT) + : shim{cufile_shim::get_instance()}, cf_file(shim, filepath, O_RDONLY | O_DIRECT) { } @@ -122,7 +152,7 @@ std::unique_ptr cufile_input_impl::read(size_t offset, rmm::cuda_stream_view stream) { rmm::device_buffer out_data(size, stream); - CUDF_EXPECTS(cuFileRead(cf_file.handle(), out_data.data(), size, offset, 0) != -1, + CUDF_EXPECTS(shim->read(cf_file.handle(), out_data.data(), size, offset, 0) != -1, "cuFile error reading from a file"); return datasource::buffer::create(std::move(out_data)); @@ -133,28 +163,27 @@ size_t cufile_input_impl::read(size_t offset, uint8_t *dst, rmm::cuda_stream_view stream) { - CUDF_EXPECTS(cuFileRead(cf_file.handle(), dst, size, offset, 0) != -1, + CUDF_EXPECTS(shim->read(cf_file.handle(), dst, size, offset, 0) != -1, "cuFile error reading from a file"); // have to read the requested size for now return size; } cufile_output_impl::cufile_output_impl(std::string const &filepath) - : driver{cufile_driver::get_instance()}, - cf_file(driver, filepath, O_CREAT | O_RDWR | O_DIRECT, 0664) + : shim{cufile_shim::get_instance()}, cf_file(shim, filepath, O_CREAT | O_RDWR | O_DIRECT, 0664) { } void cufile_output_impl::write(void const *data, size_t offset, size_t size) { - CUDF_EXPECTS(cuFileWrite(cf_file.handle(), data, size, offset, 0) != -1, + CUDF_EXPECTS(shim->write(cf_file.handle(), data, size, offset, 0) != -1, "cuFile error writing to a file"); } #endif std::unique_ptr make_cufile_input(std::string const &filepath) { -#ifdef CUFILE_INSTALLED +#ifdef CUFILE_INCLUDE try { return std::make_unique(filepath); } catch (...) { @@ -165,7 +194,7 @@ std::unique_ptr make_cufile_input(std::string const &filepath std::unique_ptr make_cufile_output(std::string const &filepath) { -#ifdef CUFILE_INSTALLED +#ifdef CUFILE_INCLUDE try { return std::make_unique(filepath); } catch (...) { diff --git a/cpp/src/io/utilities/file_utils.hpp b/cpp/src/io/utilities/file_utils.hpp index d898bf71b02..7db98ce6514 100644 --- a/cpp/src/io/utilities/file_utils.hpp +++ b/cpp/src/io/utilities/file_utils.hpp @@ -16,7 +16,7 @@ #pragma once -#ifdef CUFILE_INSTALLED +#ifdef CUFILE_INCLUDE #include #endif @@ -102,9 +102,9 @@ class cufile_output : public cufile_io_base { virtual void write(void const *data, size_t offset, size_t size) = 0; }; -#ifdef CUFILE_INSTALLED +#ifdef CUFILE_INCLUDE -class cufile_driver; +class cufile_shim; /** * @brief Class that provides RAII for cuFile file registration. */ @@ -112,29 +112,29 @@ struct cufile_registered_file { void register_handle(); public: - cufile_registered_file(cufile_driver const *driver, std::string const &filepath, int flags) - : _file(filepath, flags), _driver{driver} + cufile_registered_file(cufile_shim const *shim, std::string const &filepath, int flags) + : _file(filepath, flags), shim{shim} { register_handle(); } - cufile_registered_file(cufile_driver const *driver, + cufile_registered_file(cufile_shim const *shim, std::string const &filepath, int flags, mode_t mode) - : _file(filepath, flags, mode), _driver{driver} + : _file(filepath, flags, mode), shim{shim} { register_handle(); } - auto handle() const noexcept { return _handle; } + auto const &handle() const noexcept { return cf_handle; } ~cufile_registered_file(); private: file_wrapper const _file; - CUfileHandle_t _handle = nullptr; - cufile_driver const *_driver = nullptr; + CUfileHandle_t cf_handle = nullptr; + cufile_shim const *shim = nullptr; }; /** @@ -153,7 +153,7 @@ class cufile_input_impl final : public cufile_input { size_t read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream) override; private: - cufile_driver const *driver; + cufile_shim const *shim = nullptr; cufile_registered_file const cf_file; }; @@ -169,7 +169,7 @@ class cufile_output_impl final : public cufile_output { void write(void const *data, size_t offset, size_t size) override; private: - cufile_driver const *driver; + cufile_shim const *shim = nullptr; cufile_registered_file const cf_file; }; #else From 35609495d49ca1ea576761a40d5ef1368e019174 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 1 Mar 2021 17:34:27 -0800 Subject: [PATCH 28/52] remove newline Co-authored-by: Keith Kraus --- cpp/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5ee414b5c7b..38dec5165e2 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -536,7 +536,6 @@ function(add_library_module NAMESPACE MODULE MODULE_SOURCE_DIRS) target_compile_definitions("${NAMESPACE}_${MODULE}" PUBLIC "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${LOGGING_LEVEL}") add_dependencies("${NAMESPACE}_${MODULE}" stringify_run) target_link_libraries("${NAMESPACE}_${MODULE}" arrow arrow_cuda nvrtc ${CUDART_LIBRARY} cuda ${ZLIB_LIBRARIES} ${Boost_LIBRARIES}) - add_library("${NAMESPACE}::${MODULE}" ALIAS "${NAMESPACE}_${MODULE}") endfunction() From 378023b96f98d0bda59c41bc161f05ce541c7006 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 2 Mar 2021 09:59:33 -0800 Subject: [PATCH 29/52] add file path to error messages --- cpp/src/io/utilities/file_utils.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/utilities/file_utils.cpp b/cpp/src/io/utilities/file_utils.cpp index feeaca2d6b4..6c487533ac6 100644 --- a/cpp/src/io/utilities/file_utils.cpp +++ b/cpp/src/io/utilities/file_utils.cpp @@ -30,13 +30,13 @@ namespace io { file_wrapper::file_wrapper(std::string const &filepath, int flags) : fd(open(filepath.c_str(), flags)) { - CUDF_EXPECTS(fd != -1, "Cannot open file"); + CUDF_EXPECTS(fd != -1, "Cannot open file " + filepath); } file_wrapper::file_wrapper(std::string const &filepath, int flags, mode_t mode) : fd(open(filepath.c_str(), flags, mode)) { - CUDF_EXPECTS(fd != -1, "Cannot open file"); + CUDF_EXPECTS(fd != -1, "Cannot open file " + filepath); } /** From 5ed479b75cc7d54f6b61b0eb4d4b48188653d65c Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 2 Mar 2021 10:58:11 -0800 Subject: [PATCH 30/52] use FindcuFile --- cpp/CMakeLists.txt | 4 ++-- cpp/src/io/utilities/file_utils.cpp | 6 +++--- cpp/src/io/utilities/file_utils.hpp | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index df30187d8f3..42cf484a57d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -428,9 +428,9 @@ endif() # Add cuFile include paths if available if(cuFile_FOUND) - target_include_directories(cudf PUBLIC ${cuFile_INCLUDE_DIRS}) + target_include_directories(cudf PUBLIC "${cuFile_INCLUDE_DIRS}") target_compile_options(cudf PUBLIC ${cuFile_COMPILE_OPTIONS}) - target_compile_definitions(cudf PUBLIC CUDA_API_PER_THREAD_DEFAULT_STREAM) + target_compile_definitions(cudf PUBLIC CUFILE_FOUND) endif() # Instruct jitify to use the kernel JIT cache diff --git a/cpp/src/io/utilities/file_utils.cpp b/cpp/src/io/utilities/file_utils.cpp index 6c487533ac6..2029fef9316 100644 --- a/cpp/src/io/utilities/file_utils.cpp +++ b/cpp/src/io/utilities/file_utils.cpp @@ -63,7 +63,7 @@ long file_wrapper::size() const return _size; } -#ifdef CUFILE_INCLUDE +#ifdef CUFILE_FOUND /** * @brief Class that dynamically loads the cuFile library and manages the cuFile driver. */ @@ -183,7 +183,7 @@ void cufile_output_impl::write(void const *data, size_t offset, size_t size) std::unique_ptr make_cufile_input(std::string const &filepath) { -#ifdef CUFILE_INCLUDE +#ifdef CUFILE_FOUND try { return std::make_unique(filepath); } catch (...) { @@ -194,7 +194,7 @@ std::unique_ptr make_cufile_input(std::string const &filepath std::unique_ptr make_cufile_output(std::string const &filepath) { -#ifdef CUFILE_INCLUDE +#ifdef CUFILE_FOUND try { return std::make_unique(filepath); } catch (...) { diff --git a/cpp/src/io/utilities/file_utils.hpp b/cpp/src/io/utilities/file_utils.hpp index 7db98ce6514..683dd6320f3 100644 --- a/cpp/src/io/utilities/file_utils.hpp +++ b/cpp/src/io/utilities/file_utils.hpp @@ -16,7 +16,7 @@ #pragma once -#ifdef CUFILE_INCLUDE +#ifdef CUFILE_FOUND #include #endif @@ -102,7 +102,7 @@ class cufile_output : public cufile_io_base { virtual void write(void const *data, size_t offset, size_t size) = 0; }; -#ifdef CUFILE_INCLUDE +#ifdef CUFILE_FOUND class cufile_shim; /** From 1e2727e22079dcc821eebc6dcf0cd0554fa1a8ac Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 2 Mar 2021 11:46:55 -0800 Subject: [PATCH 31/52] add new file to cmake list --- cpp/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 42cf484a57d..2a112be6d33 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -238,6 +238,7 @@ add_library(cudf src/io/statistics/column_stats.cu src/io/utilities/data_sink.cpp src/io/utilities/datasource.cpp + src/io/utilities/file_utils.cpp src/io/utilities/parsing_utils.cu src/io/utilities/type_conversion.cpp src/jit/cache.cpp From a2f4019135841ca70e9cb29bbd414703d9dcd15b Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 2 Mar 2021 16:25:15 -0800 Subject: [PATCH 32/52] avoid using unique_ptr to store the singleton --- cpp/src/io/utilities/file_utils.cpp | 79 ++++++++++++++--------------- 1 file changed, 38 insertions(+), 41 deletions(-) diff --git a/cpp/src/io/utilities/file_utils.cpp b/cpp/src/io/utilities/file_utils.cpp index 2029fef9316..5f9e941804f 100644 --- a/cpp/src/io/utilities/file_utils.cpp +++ b/cpp/src/io/utilities/file_utils.cpp @@ -71,11 +71,19 @@ class cufile_shim { private: cufile_shim(); + std::unique_ptr init_error; + public: cufile_shim(cufile_shim const &) = delete; cufile_shim &operator=(cufile_shim const &) = delete; - static auto const *get_instance(); + static cufile_shim const *instance(); + + ~cufile_shim() + { + driver_close(); + dlclose(cf_lib); + } void *cf_lib = nullptr; decltype(cuFileDriverOpen) *driver_open = nullptr; @@ -84,51 +92,40 @@ class cufile_shim { decltype(cuFileHandleDeregister) *handle_deregister = nullptr; decltype(cuFileRead) *read = nullptr; decltype(cuFileWrite) *write = nullptr; - - ~cufile_shim() - { // try-catch? - driver_close(); - dlclose(cf_lib); - } }; cufile_shim::cufile_shim() { - cf_lib = dlopen("libcufile.so", RTLD_NOW); - driver_open = reinterpret_cast(dlsym(cf_lib, "cuFileDriverOpen")); - CUDF_EXPECTS(driver_open != nullptr, "could not find cuFile symbol"); - driver_close = reinterpret_cast(dlsym(cf_lib, "cuFileDriverClose")); - CUDF_EXPECTS(driver_close != nullptr, "could not find cuFile symbol"); - handle_register = - reinterpret_cast(dlsym(cf_lib, "cuFileHandleRegister")); - CUDF_EXPECTS(handle_register != nullptr, "could not find cuFile symbol"); - handle_deregister = - reinterpret_cast(dlsym(cf_lib, "cuFileHandleDeregister")); - CUDF_EXPECTS(handle_deregister != nullptr, "could not find cuFile symbol"); - read = reinterpret_cast(dlsym(cf_lib, "cuFileRead")); - CUDF_EXPECTS(read != nullptr, "could not find cuFile symbol"); - write = reinterpret_cast(dlsym(cf_lib, "cuFileWrite")); - CUDF_EXPECTS(write != nullptr, "could not find cuFile symbol"); - - CUDF_EXPECTS(driver_open().err == CU_FILE_SUCCESS, "Failed to initialize cuFile driver"); + try { + cf_lib = dlopen("libcufile.so", RTLD_NOW); + driver_open = reinterpret_cast(dlsym(cf_lib, "cuFileDriverOpen")); + CUDF_EXPECTS(driver_open != nullptr, "could not find cuFile cuFileDriverOpen symbol"); + driver_close = reinterpret_cast(dlsym(cf_lib, "cuFileDriverClose")); + CUDF_EXPECTS(driver_close != nullptr, "could not find cuFile cuFileDriverClose symbol"); + handle_register = + reinterpret_cast(dlsym(cf_lib, "cuFileHandleRegister")); + CUDF_EXPECTS(handle_register != nullptr, "could not find cuFile cuFileHandleRegister symbol"); + handle_deregister = + reinterpret_cast(dlsym(cf_lib, "cuFileHandleDeregister")); + CUDF_EXPECTS(handle_deregister != nullptr, + "could not find cuFile cuFileHandleDeregister symbol"); + read = reinterpret_cast(dlsym(cf_lib, "cuFileRead")); + CUDF_EXPECTS(read != nullptr, "could not find cuFile cuFileRead symbol"); + write = reinterpret_cast(dlsym(cf_lib, "cuFileWrite")); + CUDF_EXPECTS(write != nullptr, "could not find cuFile cuFileWrite symbol"); + + CUDF_EXPECTS(driver_open().err == CU_FILE_SUCCESS, "Failed to initialize cuFile driver"); + } catch (cudf::logic_error const &err) { + init_error = std::make_unique(err); + } } -auto const *cufile_shim::get_instance() +cufile_shim const *cufile_shim::instance() { - static bool first_call = true; - static std::unique_ptr instance; - if (first_call) { - try { - instance = std::unique_ptr(new cufile_shim()); - } catch (...) { - first_call = false; - throw; - } - first_call = false; - } else if (!instance) { - CUDF_FAIL("Failed to initialize cuFile driver"); - } - return instance.get(); + static cufile_shim _instance; + // Defer throwing to avoid repeated attempts to load the library + if (_instance.init_error) CUDF_FAIL("" + std::string(_instance.init_error->what())); + return &_instance; } void cufile_registered_file::register_handle() @@ -143,7 +140,7 @@ void cufile_registered_file::register_handle() cufile_registered_file::~cufile_registered_file() { shim->handle_deregister(cf_handle); } cufile_input_impl::cufile_input_impl(std::string const &filepath) - : shim{cufile_shim::get_instance()}, cf_file(shim, filepath, O_RDONLY | O_DIRECT) + : shim{cufile_shim::instance()}, cf_file(shim, filepath, O_RDONLY | O_DIRECT) { } @@ -170,7 +167,7 @@ size_t cufile_input_impl::read(size_t offset, } cufile_output_impl::cufile_output_impl(std::string const &filepath) - : shim{cufile_shim::get_instance()}, cf_file(shim, filepath, O_CREAT | O_RDWR | O_DIRECT, 0664) + : shim{cufile_shim::instance()}, cf_file(shim, filepath, O_CREAT | O_RDWR | O_DIRECT, 0664) { } From fb94b56ceb11f8d0d5156690f685798293e32b99 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 2 Mar 2021 17:14:11 -0800 Subject: [PATCH 33/52] add is_valid to shim --- cpp/src/io/utilities/file_utils.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/utilities/file_utils.cpp b/cpp/src/io/utilities/file_utils.cpp index 5f9e941804f..dd7d5729d26 100644 --- a/cpp/src/io/utilities/file_utils.cpp +++ b/cpp/src/io/utilities/file_utils.cpp @@ -72,6 +72,7 @@ class cufile_shim { cufile_shim(); std::unique_ptr init_error; + auto is_valid() const noexcept { return init_error != nullptr; } public: cufile_shim(cufile_shim const &) = delete; @@ -124,7 +125,7 @@ cufile_shim const *cufile_shim::instance() { static cufile_shim _instance; // Defer throwing to avoid repeated attempts to load the library - if (_instance.init_error) CUDF_FAIL("" + std::string(_instance.init_error->what())); + if (!_instance.is_valid()) CUDF_FAIL("" + std::string(_instance.init_error->what())); return &_instance; } From b2771ad88c78dcbaa47d5a17fafe00d0234cb4db Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 2 Mar 2021 22:10:04 -0800 Subject: [PATCH 34/52] add config class --- cpp/src/io/utilities/file_utils.cpp | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/utilities/file_utils.cpp b/cpp/src/io/utilities/file_utils.cpp index dd7d5729d26..6e44e3ec0a3 100644 --- a/cpp/src/io/utilities/file_utils.cpp +++ b/cpp/src/io/utilities/file_utils.cpp @@ -64,6 +64,25 @@ long file_wrapper::size() const } #ifdef CUFILE_FOUND + +class cufile_config { + bool enabled = true; + + cufile_config() + { + // TODO read env var + } + + public: + bool is_enabled() const { return enabled; } + + static cufile_config const *instance() + { + static cufile_config _instance; + return &_instance; + } +}; + /** * @brief Class that dynamically loads the cuFile library and manages the cuFile driver. */ @@ -72,7 +91,7 @@ class cufile_shim { cufile_shim(); std::unique_ptr init_error; - auto is_valid() const noexcept { return init_error != nullptr; } + auto is_valid() const noexcept { return init_error == nullptr; } public: cufile_shim(cufile_shim const &) = delete; @@ -126,6 +145,7 @@ cufile_shim const *cufile_shim::instance() static cufile_shim _instance; // Defer throwing to avoid repeated attempts to load the library if (!_instance.is_valid()) CUDF_FAIL("" + std::string(_instance.init_error->what())); + return &_instance; } @@ -182,9 +202,8 @@ void cufile_output_impl::write(void const *data, size_t offset, size_t size) std::unique_ptr make_cufile_input(std::string const &filepath) { #ifdef CUFILE_FOUND - try { + if (cufile_config::instance()->is_enabled()) { return std::make_unique(filepath); - } catch (...) { } #endif return nullptr; @@ -193,9 +212,8 @@ std::unique_ptr make_cufile_input(std::string const &filepath std::unique_ptr make_cufile_output(std::string const &filepath) { #ifdef CUFILE_FOUND - try { + if (cufile_config::instance()->is_enabled()) { return std::make_unique(filepath); - } catch (...) { } #endif return nullptr; From 3077bd49accd6b5d715dff00591bf87a7effe6f4 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 2 Mar 2021 23:07:34 -0800 Subject: [PATCH 35/52] read env var to enable GDS --- cpp/src/io/utilities/file_utils.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/utilities/file_utils.cpp b/cpp/src/io/utilities/file_utils.cpp index 6e44e3ec0a3..ddfefda208d 100644 --- a/cpp/src/io/utilities/file_utils.cpp +++ b/cpp/src/io/utilities/file_utils.cpp @@ -66,11 +66,17 @@ long file_wrapper::size() const #ifdef CUFILE_FOUND class cufile_config { - bool enabled = true; + bool enabled = false; cufile_config() { - // TODO read env var + auto const policy = std::getenv("LIBCUDF_CUFILE_POLICY"); + if (policy == nullptr) { + enabled = false; + } else { + auto const policy_string = std::string(policy); + enabled = (policy_string == "ALWAYS" || policy_string == "GDS"); + } } public: From 9d350dba88227533c56c747059e0bedbb7c064a1 Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 3 Mar 2021 14:09:48 -0800 Subject: [PATCH 36/52] rename file_util.hpp --- cpp/src/io/utilities/data_sink.cpp | 2 +- cpp/src/io/utilities/datasource.cpp | 2 +- cpp/src/io/utilities/{file_utils.hpp => file_io_utilities.hpp} | 0 cpp/src/io/utilities/file_utils.cpp | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename cpp/src/io/utilities/{file_utils.hpp => file_io_utilities.hpp} (100%) diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 024167b87d9..4f1cf4030c0 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -18,7 +18,7 @@ #include #include -#include +#include #include diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 8ca4cfdbe04..838e2dd913e 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -21,7 +21,7 @@ #include #include -#include +#include namespace cudf { namespace io { diff --git a/cpp/src/io/utilities/file_utils.hpp b/cpp/src/io/utilities/file_io_utilities.hpp similarity index 100% rename from cpp/src/io/utilities/file_utils.hpp rename to cpp/src/io/utilities/file_io_utilities.hpp diff --git a/cpp/src/io/utilities/file_utils.cpp b/cpp/src/io/utilities/file_utils.cpp index ddfefda208d..68ddc9273e2 100644 --- a/cpp/src/io/utilities/file_utils.cpp +++ b/cpp/src/io/utilities/file_utils.cpp @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include +#include #include #include From 09730f4d8055e116710e28d21afe055cecfabb03 Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 3 Mar 2021 22:55:27 -0800 Subject: [PATCH 37/52] control compatiblity mode --- cpp/CMakeLists.txt | 2 +- .../{file_utils.cpp => file_io_utilities.cpp} | 42 ++++++++++++++++--- 2 files changed, 37 insertions(+), 7 deletions(-) rename cpp/src/io/utilities/{file_utils.cpp => file_io_utilities.cpp} (82%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2a112be6d33..024c3ef2f75 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -238,7 +238,7 @@ add_library(cudf src/io/statistics/column_stats.cu src/io/utilities/data_sink.cpp src/io/utilities/datasource.cpp - src/io/utilities/file_utils.cpp + src/io/utilities/file_io_utilities.cpp src/io/utilities/parsing_utils.cu src/io/utilities/type_conversion.cpp src/jit/cache.cpp diff --git a/cpp/src/io/utilities/file_utils.cpp b/cpp/src/io/utilities/file_io_utilities.cpp similarity index 82% rename from cpp/src/io/utilities/file_utils.cpp rename to cpp/src/io/utilities/file_io_utilities.cpp index 68ddc9273e2..e88a082bcd2 100644 --- a/cpp/src/io/utilities/file_utils.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include #include #include @@ -24,6 +25,8 @@ #include +#include + namespace cudf { namespace io { @@ -66,16 +69,43 @@ long file_wrapper::size() const #ifdef CUFILE_FOUND class cufile_config { + std::string const default_policy = "OFF"; + std::string const json_path_env_var = "CUFILE_ENV_PATH_JSON"; + bool enabled = false; + temp_directory tmp_config_dir{"cudf_cufile_config"}; + + std::string getenv_or(std::string const &env_var_name, std::string const &default_val) + { + auto const env_val = std::getenv(env_var_name.c_str()); + return (env_val == nullptr) ? default_val : std::string(env_val); + } cufile_config() { - auto const policy = std::getenv("LIBCUDF_CUFILE_POLICY"); - if (policy == nullptr) { - enabled = false; - } else { - auto const policy_string = std::string(policy); - enabled = (policy_string == "ALWAYS" || policy_string == "GDS"); + auto const policy = getenv_or("LIBCUDF_CUFILE_POLICY", default_policy); + + enabled = (policy == "ALWAYS" || policy == "GDS"); + + if (enabled) { + auto const config_file_path = getenv_or(json_path_env_var, "/etc/cufile.json"); + std::ifstream user_config(config_file_path); + auto const cudf_config_path = tmp_config_dir.path() + "/cufile.json"; + std::ofstream cudf_config_file(cudf_config_path); + + std::string line; + while (std::getline(user_config, line)) { + std::string const tag = "\"allow_compat_mode\""; + if (line.find(tag) != std::string::npos) { + // TODO: only replace the true/false value + cudf_config_file << tag << ": " << ((policy == "ALWAYS") ? "true" : "false") << ",\n"; + } else { + cudf_config_file << line << '\n'; + } + + CUDF_EXPECTS(setenv(json_path_env_var.c_str(), cudf_config_path.c_str(), 0) == 0, + "Failed to set the cuFile config file environment variable."); + } } } From d5c67f8af4a20cd866043d851a2b00a38d1b45f9 Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 3 Mar 2021 23:15:26 -0800 Subject: [PATCH 38/52] style fix --- cpp/src/io/utilities/file_io_utilities.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index e88a082bcd2..21ff41c8ec4 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -69,7 +69,7 @@ long file_wrapper::size() const #ifdef CUFILE_FOUND class cufile_config { - std::string const default_policy = "OFF"; + std::string const default_policy = "OFF"; std::string const json_path_env_var = "CUFILE_ENV_PATH_JSON"; bool enabled = false; From 8ac417e878d0495bd2e5baa64eb9fe87152f3697 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 4 Mar 2021 11:16:31 -0800 Subject: [PATCH 39/52] update device_write condition in CSV writer --- cpp/src/io/csv/writer_impl.cu | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index dda2e0704f6..cd35ea15ac0 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -416,36 +416,28 @@ void writer::impl::write_chunked(strings_column_view const& str_column_view, auto total_num_bytes = strings_column.chars_size(); char const* ptr_all_bytes = strings_column.chars().data(); - if (out_sink_->supports_device_write()) { - // host algorithm call, but the underlying call - // is a device_write taking a device buffer; - // + if (out_sink_->is_device_write_preferred(total_num_bytes)) { + // Direct write from device memory out_sink_->device_write(ptr_all_bytes, total_num_bytes, stream); - out_sink_->device_write(newline.data(), - newline.size(), - stream); // needs newline at the end, to separate from next chunk } else { - // no device write possible; - // - // copy the bytes to host, too: - // + // copy the bytes to host to write them out thrust::host_vector h_bytes(total_num_bytes); CUDA_TRY(cudaMemcpyAsync(h_bytes.data(), ptr_all_bytes, total_num_bytes * sizeof(char), cudaMemcpyDeviceToHost, stream.value())); - stream.synchronize(); - // host algorithm call, where the underlying call - // is also host_write taking a host buffer; - // - char const* ptr_h_bytes = h_bytes.data(); - out_sink_->host_write(ptr_h_bytes, total_num_bytes); + out_sink_->host_write(h_bytes.data(), total_num_bytes); + } + + // Needs newline at the end, to separate from next chunk + if (out_sink_->is_device_write_preferred(newline.size())) { + out_sink_->device_write(newline.data(), newline.size(), stream); + } else { out_sink_->host_write(options_.get_line_terminator().data(), - options_.get_line_terminator() - .size()); // needs newline at the end, to separate from next chunk + options_.get_line_terminator().size()); } } From 273962ceebf9d67f25dc314941e16aff087211e9 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 4 Mar 2021 12:49:20 -0800 Subject: [PATCH 40/52] fall back to host if cufile io initialization fails --- cpp/src/io/utilities/file_io_utilities.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index 21ff41c8ec4..0ffb6ae27b0 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -89,12 +89,12 @@ class cufile_config { if (enabled) { auto const config_file_path = getenv_or(json_path_env_var, "/etc/cufile.json"); - std::ifstream user_config(config_file_path); + std::ifstream user_config_file(config_file_path); auto const cudf_config_path = tmp_config_dir.path() + "/cufile.json"; std::ofstream cudf_config_file(cudf_config_path); std::string line; - while (std::getline(user_config, line)) { + while (std::getline(user_config_file, line)) { std::string const tag = "\"allow_compat_mode\""; if (line.find(tag) != std::string::npos) { // TODO: only replace the true/false value @@ -239,7 +239,10 @@ std::unique_ptr make_cufile_input(std::string const &filepath { #ifdef CUFILE_FOUND if (cufile_config::instance()->is_enabled()) { - return std::make_unique(filepath); + try { + return std::make_unique(filepath); + } catch (...) { + } } #endif return nullptr; @@ -249,7 +252,10 @@ std::unique_ptr make_cufile_output(std::string const &filepa { #ifdef CUFILE_FOUND if (cufile_config::instance()->is_enabled()) { - return std::make_unique(filepath); + try { + return std::make_unique(filepath); + } catch (...) { + } } #endif return nullptr; From 071c6198bd0250d2350aec49fb3fc0fdbf4be75a Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 4 Mar 2021 15:17:15 -0800 Subject: [PATCH 41/52] docs and such --- cpp/include/cudf/io/data_sink.hpp | 22 +++++++++---- cpp/include/cudf/io/datasource.hpp | 50 ++++++++++++++++++++++++------ cpp/src/io/parquet/reader_impl.cu | 2 +- 3 files changed, 57 insertions(+), 17 deletions(-) diff --git a/cpp/include/cudf/io/data_sink.hpp b/cpp/include/cudf/io/data_sink.hpp index 5475c9ace8d..36792f7ca35 100644 --- a/cpp/include/cudf/io/data_sink.hpp +++ b/cpp/include/cudf/io/data_sink.hpp @@ -107,25 +107,35 @@ class data_sink { */ virtual bool supports_device_write() const { return false; } + /** + * @brief Estimates whether a direct device write would be more optimal for the given size. + * + * @param size Number of bytes to write + * @return whether the device write is expected to be more performant for the given size + */ virtual bool is_device_write_preferred(size_t size) const { return supports_device_write(); } /** * @brief Append the buffer content to the sink from a gpu address * - * @param[in] data Pointer to the buffer to be written into the sink object - * @param[in] size Number of bytes to write + * For optimal performance, should only be called when `is_device_write_preferred` returns `true`. + * Data sink implementations that don't support direct device writes don't need to override + * this function. * - * @return void + * @throws cudf::logic_error the object does not support direct device writes, i.e. + * `supports_device_write` returns `false`. + * + * @param data Pointer to the buffer to be written into the sink object + * @param size Number of bytes to write + * @param stream CUDA stream to use, default `rmm::cuda_stream_default` */ virtual void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) { - CUDF_FAIL("data_sink classes that support device_write must override this function."); + CUDF_FAIL("data_sink classes that support device_write must override it."); } /** * @brief Flush the data written into the sink - * - * @return void */ virtual void flush() = 0; diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index ace2010fc6d..4f9fb741110 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -52,7 +52,7 @@ class datasource { /** * @brief Returns the address of the data in the buffer. */ - virtual const uint8_t* data() const = 0; + virtual uint8_t const* data() const = 0; /** * @brief Base class destructor @@ -152,16 +152,27 @@ class datasource { */ virtual bool supports_device_read() const { return false; } + /** + * @brief Estimates whether a direct device read would be more optimal for the given size. + * + * @param size Number of bytes to read + * @return whether the device read is expected to be more performant for the given size + */ virtual bool is_device_read_preferred(size_t size) const { return supports_device_read(); } /** * @brief Returns a device buffer with a subset of data from the source. * + * For optimal performance, should only be called when `is_device_read_preferred` returns `true`. * Data source implementations that don't support direct device reads don't need to override this * function. * + * @throws cudf::logic_error the object does not support direct device reads, i.e. + * `supports_device_read` returns `false`. + * * @param offset Bytes from the start * @param size Bytes to read + * @param stream CUDA stream to use, default `rmm::cuda_stream_default` * * @return The data buffer in the device memory */ @@ -169,24 +180,29 @@ class datasource { size_t size, rmm::cuda_stream_view stream) { - CUDF_FAIL("datasource classes that support device_read must override this function."); + CUDF_FAIL("datasource classes that support device_read must override it."); } /** * @brief Reads a selected range into a preallocated device buffer * + * For optimal performance, should only be called when `is_device_read_preferred` returns `true`. * Data source implementations that don't support direct device reads don't need to override this * function. * + * @throws cudf::logic_error the object does not support direct device reads, i.e. + * `supports_device_read` returns `false`. + * * @param offset Bytes from the start * @param size Bytes to read * @param dst Address of the existing device memory + * @param stream CUDA stream to use, default `rmm::cuda_stream_default` * * @return The number of bytes read (can be smaller than size) */ virtual size_t device_read(size_t offset, size_t size, uint8_t* dst, rmm::cuda_stream_view stream) { - CUDF_FAIL("datasource classes that support device_read must override this function."); + CUDF_FAIL("datasource classes that support device_read must override it."); } /** @@ -214,34 +230,48 @@ class datasource { size_t size() const override { return _size; } - const uint8_t* data() const override { return _data; } + uint8_t const* data() const override { return _data; } private: uint8_t* const _data; size_t const _size; }; + /** + * @brief Derived implementation of `buffer` that owns the data. + * + * Can use different container types to hold the data buffer. + * + * @tparam Container Type of the container object that owns the data + */ template class owning_buffer : public buffer { public: + /** + * @brief Moves the input container into the newly created object. + */ owning_buffer(Container&& data_owner) : _data(std::move(data_owner)), _data_ptr(_data.data()), _size(_data.size()) { } - // to create a view into an existing owning buffer - owning_buffer(Container&& data_owner, const uint8_t* data_ptr, size_t size) + + /** + * @brief Moves the input container into the newly created object, and exposes a subspan of the + * buffer. + */ + owning_buffer(Container&& data_owner, uint8_t const* data_ptr, size_t size) : _data(std::move(data_owner)), _data_ptr(data_ptr), _size(size) { } size_t size() const override { return _size; } - const uint8_t* data() const override { return static_cast(_data_ptr); } + uint8_t const* data() const override { return static_cast(_data_ptr); } private: Container _data; - void const* const _data_ptr; - size_t const _size; + void const* _data_ptr; + size_t _size; }; }; @@ -268,7 +298,7 @@ class arrow_io_source : public datasource { { } size_t size() const override { return arrow_buffer->size(); } - const uint8_t* data() const override { return arrow_buffer->data(); } + uint8_t const* data() const override { return arrow_buffer->data(); } }; public: diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index 2f9b95bbb38..2eafe8d7d49 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -858,7 +858,7 @@ void reader::impl::read_column_chunks( page_data[chunk] = datasource::buffer::create(rmm::device_buffer(buffer->data(), buffer->size(), stream)); } - uint8_t const *d_compdata = page_data[chunk]->data(); + auto d_compdata = page_data[chunk]->data(); do { chunks[chunk].compressed_data = d_compdata; d_compdata += chunks[chunk].compressed_size; From c1010a9184232c9a7542c6ac4e100c9097d562d3 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 4 Mar 2021 15:21:58 -0800 Subject: [PATCH 42/52] (c) year --- cpp/include/cudf/io/data_sink.hpp | 2 +- cpp/include/cudf/io/datasource.hpp | 2 +- cpp/src/io/csv/writer_impl.cu | 2 +- cpp/src/io/parquet/parquet_gpu.hpp | 2 +- cpp/src/io/parquet/reader_impl.cu | 2 +- cpp/src/io/parquet/reader_impl.hpp | 2 +- cpp/src/io/utilities/data_sink.cpp | 2 +- cpp/src/io/utilities/datasource.cpp | 2 +- cpp/src/io/utilities/file_io_utilities.hpp | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/include/cudf/io/data_sink.hpp b/cpp/include/cudf/io/data_sink.hpp index 36792f7ca35..b3aaf79db6b 100644 --- a/cpp/include/cudf/io/data_sink.hpp +++ b/cpp/include/cudf/io/data_sink.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index 4f9fb741110..33732e681bd 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index cd35ea15ac0..f7e153d71f4 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 7c6735811a9..37704bf2621 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index 2eafe8d7d49..16cf0877c23 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 0ea010a3cb4..ca200936134 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 4f1cf4030c0..7591b7bd7ec 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 838e2dd913e..62c4f9e9917 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index 683dd6320f3..a23e844fa7c 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 13b3c6a0c9e46ee3e1eca907721f9eb7aeaa23f2 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 4 Mar 2021 15:22:14 -0800 Subject: [PATCH 43/52] one more (c) year --- cpp/src/io/utilities/file_io_utilities.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index 0ffb6ae27b0..c92c2519377 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 67d901b514d405632de6c5c0980cf1940a1c13c8 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 4 Mar 2021 15:39:56 -0800 Subject: [PATCH 44/52] namespace --- cpp/src/io/utilities/data_sink.cpp | 5 +++-- cpp/src/io/utilities/datasource.cpp | 6 +++--- cpp/src/io/utilities/file_io_utilities.cpp | 14 ++------------ cpp/src/io/utilities/file_io_utilities.hpp | 10 ++++++---- 4 files changed, 14 insertions(+), 21 deletions(-) diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 7591b7bd7ec..10af7bcb0bd 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -29,7 +29,8 @@ namespace io { */ class file_sink : public data_sink { public: - explicit file_sink(std::string const& filepath) : _cufile_out(make_cufile_output(filepath)) + explicit file_sink(std::string const& filepath) + : _cufile_out(detail::make_cufile_output(filepath)) { _output_stream.open(filepath, std::ios::out | std::ios::binary | std::ios::trunc); CUDF_EXPECTS(_output_stream.is_open(), "Cannot open output file"); @@ -66,7 +67,7 @@ class file_sink : public data_sink { private: std::ofstream _output_stream; size_t _bytes_written = 0; - std::unique_ptr _cufile_out; + std::unique_ptr _cufile_out; }; /** diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 62c4f9e9917..3f2884d5b7d 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -46,9 +46,9 @@ class memory_mapped_source : public datasource { public: explicit memory_mapped_source(const char *filepath, size_t offset, size_t size) - : _cufile_in(make_cufile_input(filepath)) + : _cufile_in(detail::make_cufile_input(filepath)) { - auto const file = file_wrapper(filepath, O_RDONLY); + auto const file = detail::file_wrapper(filepath, O_RDONLY); _file_size = file.size(); if (_file_size != 0) { map(file.desc(), offset, size); } } @@ -138,7 +138,7 @@ class memory_mapped_source : public datasource { void *_map_addr = nullptr; size_t _map_size = 0; size_t _map_offset = 0; - std::unique_ptr _cufile_in; + std::unique_ptr _cufile_in; }; /** diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index c92c2519377..eda06ac6986 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -29,6 +29,7 @@ namespace cudf { namespace io { +namespace detail { file_wrapper::file_wrapper(std::string const &filepath, int flags) : fd(open(filepath.c_str(), flags)) @@ -42,18 +43,6 @@ file_wrapper::file_wrapper(std::string const &filepath, int flags, mode_t mode) CUDF_EXPECTS(fd != -1, "Cannot open file " + filepath); } -/** - * Returns the directory from which the libcudf.so is loaded. - */ -std::string get_libcudf_dir_path() -{ - Dl_info dl_info{}; - dladdr((void *)get_libcudf_dir_path, &dl_info); - std::string full_path{dl_info.dli_fname}; - auto const dir_path = full_path.substr(0, full_path.find_last_of('/') + 1); - return dir_path; -} - file_wrapper::~file_wrapper() { close(fd); } long file_wrapper::size() const @@ -261,5 +250,6 @@ std::unique_ptr make_cufile_output(std::string const &filepa return nullptr; } +}; // namespace detail }; // namespace io }; // namespace cudf diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index a23e844fa7c..641c138ea08 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -30,6 +30,8 @@ namespace cudf { namespace io { +namespace detail { + /** * @brief Class that provides RAII for file handling. */ @@ -48,7 +50,7 @@ class file_wrapper { /** * @brief Base class for cuFile input/output. * - * Contains the file handles and common API for cuFile input and output classes. + * Contains the common API for cuFile input and output classes. */ class cufile_io_base { public: @@ -65,8 +67,8 @@ class cufile_io_base { /** * @brief The read/write size above which cuFile is faster then host read + copy * - * This may not be the optimal threshold for all systems. `is_cufile_io_preferred` can use a - * different logic based on the system config. + * This may not be the optimal threshold for all systems. Derived `is_cufile_io_preferred` + * implementations can use a different logic. */ static constexpr size_t op_size_threshold = 128 << 10; }; @@ -213,6 +215,6 @@ std::unique_ptr make_cufile_input(std::string const &filepath * cuFile library is not installed. */ std::unique_ptr make_cufile_output(std::string const &filepath); - +}; // namespace detail }; // namespace io }; // namespace cudf From 851c0bd295d4b4fe7c27eeeb4e73d1cc285b1e55 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 4 Mar 2021 17:35:07 -0800 Subject: [PATCH 45/52] docs, comments; disable fallback w/ compat mode --- cpp/src/io/utilities/file_io_utilities.cpp | 38 +++++++++++++++------- cpp/src/io/utilities/file_io_utilities.hpp | 2 +- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index eda06ac6986..c4669ee63ec 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -57,11 +57,14 @@ long file_wrapper::size() const #ifdef CUFILE_FOUND +/** + * @brief Class that manages cuFile configuration. + */ class cufile_config { std::string const default_policy = "OFF"; std::string const json_path_env_var = "CUFILE_ENV_PATH_JSON"; - bool enabled = false; + std::string const policy = default_policy; temp_directory tmp_config_dir{"cudf_cufile_config"}; std::string getenv_or(std::string const &env_var_name, std::string const &default_val) @@ -70,15 +73,13 @@ class cufile_config { return (env_val == nullptr) ? default_val : std::string(env_val); } - cufile_config() + cufile_config() : policy{getenv_or("LIBCUDF_CUFILE_POLICY", default_policy)} { - auto const policy = getenv_or("LIBCUDF_CUFILE_POLICY", default_policy); - - enabled = (policy == "ALWAYS" || policy == "GDS"); - - if (enabled) { + if (is_enabled()) { + // Modify the config file based on the policy auto const config_file_path = getenv_or(json_path_env_var, "/etc/cufile.json"); std::ifstream user_config_file(config_file_path); + // Modified config file is stored in a temporary directory auto const cudf_config_path = tmp_config_dir.path() + "/cufile.json"; std::ofstream cudf_config_file(cudf_config_path); @@ -87,11 +88,13 @@ class cufile_config { std::string const tag = "\"allow_compat_mode\""; if (line.find(tag) != std::string::npos) { // TODO: only replace the true/false value - cudf_config_file << tag << ": " << ((policy == "ALWAYS") ? "true" : "false") << ",\n"; + // Enable compatiblity mode when cuDF does not fall back to host path + cudf_config_file << tag << ": " << (is_required() ? "true" : "false") << ",\n"; } else { cudf_config_file << line << '\n'; } + // Point libcufile to the modified config file CUDF_EXPECTS(setenv(json_path_env_var.c_str(), cudf_config_path.c_str(), 0) == 0, "Failed to set the cuFile config file environment variable."); } @@ -99,7 +102,15 @@ class cufile_config { } public: - bool is_enabled() const { return enabled; } + /** + * @brief Returns true when cuFile use is enabled. + */ + bool is_enabled() const { return policy == "ALWAYS" || policy == "GDS"; } + + /** + * @brief Returns true when cuDF should not fall back to host IO. + */ + bool is_required() const { return policy == "ALWAYS"; } static cufile_config const *instance() { @@ -115,6 +126,10 @@ class cufile_shim { private: cufile_shim(); + void *cf_lib = nullptr; + decltype(cuFileDriverOpen) *driver_open = nullptr; + decltype(cuFileDriverClose) *driver_close = nullptr; + std::unique_ptr init_error; auto is_valid() const noexcept { return init_error == nullptr; } @@ -130,9 +145,6 @@ class cufile_shim { dlclose(cf_lib); } - void *cf_lib = nullptr; - decltype(cuFileDriverOpen) *driver_open = nullptr; - decltype(cuFileDriverClose) *driver_close = nullptr; decltype(cuFileHandleRegister) *handle_register = nullptr; decltype(cuFileHandleDeregister) *handle_deregister = nullptr; decltype(cuFileRead) *read = nullptr; @@ -231,6 +243,7 @@ std::unique_ptr make_cufile_input(std::string const &filepath try { return std::make_unique(filepath); } catch (...) { + if (cufile_config::instance()->is_required()) throw; } } #endif @@ -244,6 +257,7 @@ std::unique_ptr make_cufile_output(std::string const &filepa try { return std::make_unique(filepath); } catch (...) { + if (cufile_config::instance()->is_required()) throw; } } #endif diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index 641c138ea08..761880615c5 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -29,7 +29,6 @@ namespace cudf { namespace io { - namespace detail { /** @@ -215,6 +214,7 @@ std::unique_ptr make_cufile_input(std::string const &filepath * cuFile library is not installed. */ std::unique_ptr make_cufile_output(std::string const &filepath); + }; // namespace detail }; // namespace io }; // namespace cudf From b853f976a1a3d02676be617e336eaf318fe4754d Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 4 Mar 2021 17:51:43 -0800 Subject: [PATCH 46/52] moar docs --- cpp/include/cudf/io/datasource.hpp | 10 ++++----- cpp/src/io/utilities/file_io_utilities.cpp | 2 +- cpp/src/io/utilities/file_io_utilities.hpp | 24 +++++++++++++++++++++- 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index 33732e681bd..b749dfb6a3c 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -170,8 +170,8 @@ class datasource { * @throws cudf::logic_error the object does not support direct device reads, i.e. * `supports_device_read` returns `false`. * - * @param offset Bytes from the start - * @param size Bytes to read + * @param offset Number of bytes from the start + * @param size Number of bytes to read * @param stream CUDA stream to use, default `rmm::cuda_stream_default` * * @return The data buffer in the device memory @@ -190,11 +190,11 @@ class datasource { * Data source implementations that don't support direct device reads don't need to override this * function. * - * @throws cudf::logic_error the object does not support direct device reads, i.e. + * @throws cudf::logic_error when the object does not support direct device reads, i.e. * `supports_device_read` returns `false`. * - * @param offset Bytes from the start - * @param size Bytes to read + * @param offset Number of bytes from the start + * @param size Number of bytes to read * @param dst Address of the existing device memory * @param stream CUDA stream to use, default `rmm::cuda_stream_default` * diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index c4669ee63ec..198292bf0ab 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -220,7 +220,7 @@ size_t cufile_input_impl::read(size_t offset, { CUDF_EXPECTS(shim->read(cf_file.handle(), dst, size, offset, 0) != -1, "cuFile error reading from a file"); - // have to read the requested size for now + // always read the requested size for now return size; } diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index 761880615c5..880ce438e38 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -79,6 +79,14 @@ class cufile_input : public cufile_io_base { public: /** * @brief Reads into a new device buffer. + * + * @throws cudf::logic_error on cuFile error + * + * @param offset Number of bytes from the start + * @param size Number of bytes to read + * @param stream CUDA stream to use, default `rmm::cuda_stream_default` + * + * @return The data buffer in the device memory */ virtual std::unique_ptr read(size_t offset, size_t size, @@ -87,7 +95,14 @@ class cufile_input : public cufile_io_base { /** * @brief Reads into existing device memory. * - * Returns the number of bytes read. + * @throws cudf::logic_error on cuFile error + * + * @param offset Number of bytes from the start + * @param size Number of bytes to read + * @param dst Address of the existing device memory + * @param stream CUDA stream to use, default `rmm::cuda_stream_default` + * + * @return The number of bytes read */ virtual size_t read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream) = 0; }; @@ -99,6 +114,12 @@ class cufile_output : public cufile_io_base { public: /** * @brief Writes the data from a device buffer into a file. + * + * @throws cudf::logic_error on cuFile error + * + * @param data Pointer to the buffer to be written into the output file + * @param offset Number of bytes from the start + * @param size Number of bytes to write */ virtual void write(void const *data, size_t offset, size_t size) = 0; }; @@ -106,6 +127,7 @@ class cufile_output : public cufile_io_base { #ifdef CUFILE_FOUND class cufile_shim; + /** * @brief Class that provides RAII for cuFile file registration. */ From 750460fad9b4d2346662660cb43bf556036c8fb7 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 4 Mar 2021 18:00:52 -0800 Subject: [PATCH 47/52] remove includes --- cpp/src/io/utilities/file_io_utilities.cpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index 198292bf0ab..7149982a7c0 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -16,15 +16,10 @@ #include #include -#include -#include -#include -#include -#include -#include - #include +#include + #include namespace cudf { From 96c5b53ce8e65b57612a1a1ee30ae996ffae5788 Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 5 Mar 2021 13:39:35 -0800 Subject: [PATCH 48/52] CMake improvement --- cpp/CMakeLists.txt | 12 +++++------- cpp/cmake/Modules/FindcuFile.cmake | 6 ++++++ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 024c3ef2f75..b68e07b944b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -427,13 +427,6 @@ if(CONDA_INCLUDE_DIRS) target_include_directories(cudf PUBLIC "$") endif() -# Add cuFile include paths if available -if(cuFile_FOUND) - target_include_directories(cudf PUBLIC "${cuFile_INCLUDE_DIRS}") - target_compile_options(cudf PUBLIC ${cuFile_COMPILE_OPTIONS}) - target_compile_definitions(cudf PUBLIC CUFILE_FOUND) -endif() - # Instruct jitify to use the kernel JIT cache if(JITIFY_USE_CACHE) target_compile_definitions(cudf PUBLIC JITIFY_USE_CACHE "CUDF_VERSION=${PROJECT_VERSION}") @@ -469,6 +462,11 @@ else() target_link_libraries(cudf PUBLIC CUDA::nvrtc CUDA::cudart CUDA::cuda_driver) endif() +# Add cuFile interface if available +if(cuFile_FOUND) + target_link_libraries(cudf PUBLIC cuFile::cuFile_interface) +endif() + file(WRITE "${CUDF_BINARY_DIR}/fatbin.ld" [=[ SECTIONS diff --git a/cpp/cmake/Modules/FindcuFile.cmake b/cpp/cmake/Modules/FindcuFile.cmake index e67b79d9d60..77d04a70e29 100644 --- a/cpp/cmake/Modules/FindcuFile.cmake +++ b/cpp/cmake/Modules/FindcuFile.cmake @@ -93,6 +93,12 @@ find_package_handle_standard_args(cuFile cuFile_VERSION ) +if (cuFile_FOUND) + add_library(cuFile::cuFile_interface IMPORTED INTERFACE) + target_include_directories(cuFile::cuFile_interface INTERFACE "$") + target_compile_options(cuFile::cuFile_interface INTERFACE "${cuFile_COMPILE_OPTIONS}") + target_compile_definitions(cuFile::cuFile_interface INTERFACE CUFILE_FOUND) +endif () if (cuFile_FOUND AND NOT TARGET cuFile::cuFile) add_library(cuFile::cuFile UNKNOWN IMPORTED) From 6be526e8a2ccdb41403fe54374763ad0cec167d1 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 9 Mar 2021 14:50:02 -0800 Subject: [PATCH 49/52] address review feedback --- cpp/CMakeLists.txt | 4 ++-- cpp/benchmarks/fixture/benchmark_fixture.hpp | 2 +- cpp/include/cudf/io/data_sink.hpp | 4 ++-- cpp/include/cudf/io/datasource.hpp | 4 ++-- cpp/src/io/utilities/file_io_utilities.cpp | 8 ++++---- cpp/src/io/utilities/file_io_utilities.hpp | 10 +++++----- 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index b68e07b944b..af08f45ec1b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -463,8 +463,8 @@ else() endif() # Add cuFile interface if available -if(cuFile_FOUND) - target_link_libraries(cudf PUBLIC cuFile::cuFile_interface) +if(cuFile_FOUND AND NOT TARGET cuFile::cuFile_interface) + target_link_libraries(cudf PRIVATE cuFile::cuFile_interface) endif() file(WRITE "${CUDF_BINARY_DIR}/fatbin.ld" diff --git a/cpp/benchmarks/fixture/benchmark_fixture.hpp b/cpp/benchmarks/fixture/benchmark_fixture.hpp index ad2ce095b6e..dd1bbcba0b4 100644 --- a/cpp/benchmarks/fixture/benchmark_fixture.hpp +++ b/cpp/benchmarks/fixture/benchmark_fixture.hpp @@ -88,4 +88,4 @@ class benchmark : public ::benchmark::Fixture { std::shared_ptr mr; }; -}; // namespace cudf +} // namespace cudf diff --git a/cpp/include/cudf/io/data_sink.hpp b/cpp/include/cudf/io/data_sink.hpp index b3aaf79db6b..e0eb60af070 100644 --- a/cpp/include/cudf/io/data_sink.hpp +++ b/cpp/include/cudf/io/data_sink.hpp @@ -125,9 +125,9 @@ class data_sink { * @throws cudf::logic_error the object does not support direct device writes, i.e. * `supports_device_write` returns `false`. * - * @param data Pointer to the buffer to be written into the sink object + * @param gpu_data Pointer to the buffer to be written into the sink object * @param size Number of bytes to write - * @param stream CUDA stream to use, default `rmm::cuda_stream_default` + * @param stream CUDA stream to use */ virtual void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) { diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index b749dfb6a3c..8fcc045e6d2 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -172,7 +172,7 @@ class datasource { * * @param offset Number of bytes from the start * @param size Number of bytes to read - * @param stream CUDA stream to use, default `rmm::cuda_stream_default` + * @param stream CUDA stream to use * * @return The data buffer in the device memory */ @@ -196,7 +196,7 @@ class datasource { * @param offset Number of bytes from the start * @param size Number of bytes to read * @param dst Address of the existing device memory - * @param stream CUDA stream to use, default `rmm::cuda_stream_default` + * @param stream CUDA stream to use * * @return The number of bytes read (can be smaller than size) */ diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index 7149982a7c0..22ff057cbc1 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -100,7 +100,7 @@ class cufile_config { /** * @brief Returns true when cuFile use is enabled. */ - bool is_enabled() const { return policy == "ALWAYS" || policy == "GDS"; } + bool is_enabled() const { return policy == "ALWAYS" or policy == "GDS"; } /** * @brief Returns true when cuDF should not fall back to host IO. @@ -259,6 +259,6 @@ std::unique_ptr make_cufile_output(std::string const &filepa return nullptr; } -}; // namespace detail -}; // namespace io -}; // namespace cudf +} // namespace detail +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index 880ce438e38..85399bdd44d 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -84,7 +84,7 @@ class cufile_input : public cufile_io_base { * * @param offset Number of bytes from the start * @param size Number of bytes to read - * @param stream CUDA stream to use, default `rmm::cuda_stream_default` + * @param stream CUDA stream to use * * @return The data buffer in the device memory */ @@ -100,7 +100,7 @@ class cufile_input : public cufile_io_base { * @param offset Number of bytes from the start * @param size Number of bytes to read * @param dst Address of the existing device memory - * @param stream CUDA stream to use, default `rmm::cuda_stream_default` + * @param stream CUDA stream to use * * @return The number of bytes read */ @@ -237,6 +237,6 @@ std::unique_ptr make_cufile_input(std::string const &filepath */ std::unique_ptr make_cufile_output(std::string const &filepath); -}; // namespace detail -}; // namespace io -}; // namespace cudf +} // namespace detail +} // namespace io +} // namespace cudf From 0ad1418cd45267199e2593fbe2fe50bb392a173d Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 12 Mar 2021 16:23:52 -0800 Subject: [PATCH 50/52] fix a CMake error --- cpp/CMakeLists.txt | 2 +- cpp/cmake/Modules/FindcuFile.cmake | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 4dbd0eff422..1c4319485a3 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -469,7 +469,7 @@ else() endif() # Add cuFile interface if available -if(cuFile_FOUND AND NOT TARGET cuFile::cuFile_interface) +if(cuFile_FOUND) target_link_libraries(cudf PRIVATE cuFile::cuFile_interface) endif() diff --git a/cpp/cmake/Modules/FindcuFile.cmake b/cpp/cmake/Modules/FindcuFile.cmake index 77d04a70e29..1bb86310507 100644 --- a/cpp/cmake/Modules/FindcuFile.cmake +++ b/cpp/cmake/Modules/FindcuFile.cmake @@ -93,7 +93,7 @@ find_package_handle_standard_args(cuFile cuFile_VERSION ) -if (cuFile_FOUND) +if (cuFile_FOUND AND NOT TARGET cuFile::cuFile_interface) add_library(cuFile::cuFile_interface IMPORTED INTERFACE) target_include_directories(cuFile::cuFile_interface INTERFACE "$") target_compile_options(cuFile::cuFile_interface INTERFACE "${cuFile_COMPILE_OPTIONS}") From 380287ea2b7707966cdfebabd9c981ea00394f69 Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 12 Mar 2021 16:55:01 -0800 Subject: [PATCH 51/52] allocate the pinned buffer only if used --- cpp/src/io/parquet/writer_impl.cu | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 4ee42b23641..dd68bc50043 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1108,12 +1108,7 @@ void writer::impl::write(table_view const &table) num_stats_bfr); } - auto host_bfr = pinned_buffer{[](size_t size) { - uint8_t *ptr = nullptr; - CUDA_TRY(cudaMallocHost(&ptr, size)); - return ptr; - }(max_chunk_bfr_size), - cudaFreeHost}; + pinned_buffer host_bfr{nullptr, cudaFreeHost}; // Encode row groups in batches for (uint32_t b = 0, r = 0, global_r = global_rowgroup_base; b < (uint32_t)batch_list.size(); @@ -1163,6 +1158,14 @@ void writer::impl::write(table_view const &table) stream.synchronize(); } } else { + if (!host_bfr) { + host_bfr = pinned_buffer{[](size_t size) { + uint8_t *ptr = nullptr; + CUDA_TRY(cudaMallocHost(&ptr, size)); + return ptr; + }(max_chunk_bfr_size), + cudaFreeHost}; + } // copy the full data CUDA_TRY(cudaMemcpyAsync(host_bfr.get(), dev_bfr, From e5f12b989e15b772d263b5ac6cc930ed110fc1b6 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 16 Mar 2021 14:12:06 -0700 Subject: [PATCH 52/52] fix up CMake for cufile header only builds --- cpp/CMakeLists.txt | 2 +- cpp/cmake/Modules/FindcuFile.cmake | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index bba35f7da3a..3609a921c6f 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -468,7 +468,7 @@ else() endif() # Add cuFile interface if available -if(cuFile_FOUND) +if(TARGET cuFile::cuFile_interface) target_link_libraries(cudf PRIVATE cuFile::cuFile_interface) endif() diff --git a/cpp/cmake/Modules/FindcuFile.cmake b/cpp/cmake/Modules/FindcuFile.cmake index 1bb86310507..4f67e186f42 100644 --- a/cpp/cmake/Modules/FindcuFile.cmake +++ b/cpp/cmake/Modules/FindcuFile.cmake @@ -93,7 +93,7 @@ find_package_handle_standard_args(cuFile cuFile_VERSION ) -if (cuFile_FOUND AND NOT TARGET cuFile::cuFile_interface) +if (cuFile_INCLUDE_DIR AND NOT TARGET cuFile::cuFile_interface) add_library(cuFile::cuFile_interface IMPORTED INTERFACE) target_include_directories(cuFile::cuFile_interface INTERFACE "$") target_compile_options(cuFile::cuFile_interface INTERFACE "${cuFile_COMPILE_OPTIONS}")