rapidsai · trevorsm7 · Nov 28, 2019 · Nov 22, 2019 · Nov 22, 2019 · Nov 22, 2019
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -19,6 +19,7 @@
 - PR #3161 Move merge files to legacy
 - PR #3079 Added support to write ORC files given a local path
 - PR #3192 Add dtype param to cast `DataFrame` on init
+- PR #3213 Port cuIO to libcudf++
 - PR #3222 Add nvtext character tokenizer
 - PR #3223 Java expose underlying buffers
 - PR #3300 Add `DataFrame.insert`
@@ -145,6 +146,7 @@
 - PR #3425 Strings column copy_if_else implementation
 - PR #3422 Move utilities to legacy
 - PR #3201 Define and implement new datetime_ops APIs
+- PR #3448 Port scatter_to_tables to libcudf++
 
 ## Bug Fixes
 
@@ -203,6 +205,7 @@
 - PR #3424 Fix benchmark build by adding libcudacxx to benchmark's CMakeLists.txt
 - PR #3435 Fix diff and shift for empty series
 - PR #3439 Fix index-name bug in StringColumn concat
+- PR #3445 Fix ORC Writer default stripe size
 
 # cuDF 0.10.0 (16 Oct 2019)
 

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -436,9 +436,12 @@ add_library(cudf
             src/io/avro/legacy/avro_reader_impl.cu
             src/io/avro/avro_gpu.cu
             src/io/avro/avro.cpp
+            src/io/avro/reader_impl.cu
             src/io/csv/legacy/csv_reader_impl.cu
             src/io/csv/legacy/csv_writer.cu
             src/io/csv/legacy/csv_gpu.cu
+            src/io/csv/csv_gpu.cu
+            src/io/csv/reader_impl.cu
             src/io/json/legacy/json_reader_impl.cu
             src/io/orc/legacy/orc_reader_impl.cu
             src/io/orc/legacy/orc_writer_impl.cu
@@ -448,17 +451,21 @@ add_library(cudf
             src/io/orc/stripe_init.cu
             src/io/orc/stripe_enc.cu
             src/io/orc/dict_enc.cu
+            src/io/orc/reader_impl.cu
+            src/io/orc/writer_impl.cu
             src/io/parquet/page_data.cu
             src/io/parquet/page_hdr.cu
             src/io/parquet/legacy/parquet_reader_impl.cu
             src/io/parquet/parquet.cpp
+            src/io/parquet/reader_impl.cu
             src/io/comp/cpu_unbz2.cpp
             src/io/comp/uncomp.cpp
             src/io/comp/brotli_dict.cpp
             src/io/comp/debrotli.cu
             src/io/comp/snap.cu
             src/io/comp/unsnap.cu
             src/io/comp/gpuinflate.cu
+            src/io/functions.cpp
             src/io/utilities/datasource.cpp
             src/io/utilities/legacy/parsing_utils.cu
             src/utilities/legacy/cuda_utils.cu

@@ -129,6 +129,43 @@ std::unique_ptr<table> scatter(
     table_view const& target, bool check_bounds = false,
     rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());
 
+/**
+ * @brief Scatters the rows of a table to `n` tables according to a partition map
+ *
+ * Copies the rows from the `input` table to new tables according to the table
+ * indices given by `partition_map`. The number of output tables, `n`, is one
+ * more than the maximum value in `partition_map`.
+ *
+ * If a value in [0, n) does not appear in `partition_map`, then the
+ * corresponding output table will be empty.
+ *
+ * @throw cudf::logic_error when `partition_map` is a non-integer type
+ * @throw cudf::logic_error when `partition_map` is larger than `input`
+ * @throw cudf::logic_error when `partition_map` has nulls
+ *
+ * Example:
+ * input:         [{10, 12, 14, 16, 18, 20, 22, 24, 26, 28},
+ *                 { 1,  2,  3,  4, null, 0, 2,  4,  6,  2}]
+ * partition_map: {3,  4,  3,  1,  4,  4,  0,  1,  1,  1}
+ * output:     {[{22}, {2}],
+ *              [{16, 24, 26, 28}, {4, 4, 6, 2}],
+ *              [{}, {}],
+ *              [{10, 14}, {1, 3}],
+ *              [{12, 18, 20}, {2, null, 0}]}
+ *
+ * @param input Table whose rows will be partitioned into a set of
+ * tables according to `partition_map`
+ * @param partition_map Non-null column of integer values that map
+ * each row in the `input` table to one of the output tables
+ * @param mr The resource to use for all allocations
+ *
+ * @return A vector of tables containing the scattered rows of `input`.
+ * `table` `i` contains all rows `j` from `input` where `partition_map[j] == i`.
+ */
+std::vector<std::unique_ptr<table>> scatter_to_tables(
+    table_view const& input, column_view const& partition_map,
+    rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());
+
 /** ---------------------------------------------------------------------------*
 * @brief Indicates when to allocate a mask, based on an existing mask.
 * ---------------------------------------------------------------------------**/

@@ -105,6 +105,45 @@ std::unique_ptr<table> scatter(
     rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(),
     cudaStream_t stream = 0);
 
+/**
+ * @brief Scatters the rows of a table to `n` tables according to a partition map
+ *
+ * Copies the rows from the `input` table to new tables according to the table
+ * indices given by `partition_map`. The number of output tables, `n`, is one
+ * more than the maximum value in `partition_map`.
+ *
+ * If a value in [0, n) does not appear in `partition_map`, then the
+ * corresponding output table will be empty.
+ *
+ * @throw cudf::logic_error when `partition_map` is a non-integer type
+ * @throw cudf::logic_error when `partition_map` is larger than `input`
+ * @throw cudf::logic_error when `partition_map` has nulls
+ *
+ * Example:
+ * input:         [{10, 12, 14, 16, 18, 20, 22, 24, 26, 28},
+ *                 { 1,  2,  3,  4, null, 0, 2,  4,  6,  2}]
+ * partition_map: {3,  4,  3,  1,  4,  4,  0,  1,  1,  1}
+ * output:     {[{22}, {2}],
+ *              [{16, 24, 26, 28}, {4, 4, 6, 2}],
+ *              [{}, {}],
+ *              [{10, 14}, {1, 3}],
+ *              [{12, 18, 20}, {2, null, 0}]}
+ *
+ * @param input Table whose rows will be partitioned into a set of
+ * tables according to `partition_map`
+ * @param partition_map Non-null column of integer values that map
+ * each row in the `input` table to one of the output tables
+ * @param mr The resource to use for all allocations
+ * @param stream The stream to use for CUDA operations
+ *
+ * @return A vector of tables containing the scattered rows of `input`.
+ * `table` `i` contains all rows `j` from `input` where `partition_map[j] == i`.
+ */
+std::vector<std::unique_ptr<table>> scatter_to_tables(
+    table_view const& input, column_view const& partition_map,
+    rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(),
+    cudaStream_t stream = 0);
+
 }  // namespace detail
 }  // namespace experimental
 }  // namespace cudf