Add cudf::explode_position (#7376)

This PR adds support for pos_explode in cuDF. It is very similar to explode, but also includes a column holding the index that each exploded element had within its original array.

```
a          b
[0, 1]     100
[2]        200
[3, 4, 5]  300
```

exploded on column a would result in

```
a  pos  b
0  0    100
1  1    100
2  0    200
3  0    300
4  1    300
5  2    300
```

Partially fixes #6151

Authors:
- Mike Wilson (@hyperbolic2346)

Approvers:
- David (@davidwendt)
- Jake Hemstad (@jrhemstad)

URL: #7376
rapidsai · Feb 26, 2021 · 9ae85ae · 9ae85ae
1 parent 7526be7
commit 9ae85ae
Show file tree

Hide file tree

Showing 4 changed files with 210 additions and 41 deletions.
diff --git a/cpp/include/cudf/lists/lists_column_view.hpp b/cpp/include/cudf/lists/lists_column_view.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -56,6 +56,10 @@ class lists_column_view : private column_view {
   using column_view::null_mask;
   using column_view::offset;
   using column_view::size;
+  using offset_type = int32_t;
+  static_assert(std::is_same<offset_type, size_type>::value,
+                "offset_type is expected to be the same as size_type.");
+  using offset_iterator = offset_type const*;
 
   /**
    * @brief Returns the parent column.
@@ -87,6 +91,23 @@ class lists_column_view : private column_view {
    * @throw cudf::logic error if this is an empty column
    */
   column_view get_sliced_child(rmm::cuda_stream_view stream) const;
+
+  /**
+   * @brief Return first offset (accounting for column offset)
+   *
+   * The parent's `offset()` is added to the start of the offsets child so that a sliced
+   * view begins at its own first list instead of at the start of the offsets buffer.
+   *
+   * @return int32_t const* Pointer to the first offset
+   */
+  offset_iterator offsets_begin() const noexcept
+  {
+    return offsets().begin<offset_type>() + offset();
+  }
+
+  /**
+   * @brief Return one past the last offset
+   *
+   * A lists column with `size()` rows is described by `size() + 1` offsets, so the end of
+   * the offsets range is `size() + 1` elements past `offsets_begin()`.
+   *
+   * @return int32_t const* Pointer to one past the last offset
+   */
+  offset_iterator offsets_end() const noexcept { return offsets_begin() + size() + 1; }
 };
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/include/cudf/reshape.hpp b/cpp/include/cudf/reshape.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -125,8 +125,8 @@ std::unique_ptr<column> byte_cast(
  * [null,        100],
  * [15,          100],
  * ```
- * Note that null lists are completely removed from the output
- * and nulls and empty lists inside lists are pulled out and remain.
+ * Note that null lists are not included in the resulting table, but nulls inside
+ * lists and empty lists will be represented with a null entry for that column in that row.
  *
  * @param input_table Table to explode.
  * @param explode_column_idx Column index to explode inside the table.
@@ -139,6 +139,49 @@ std::unique_ptr<table> explode(
   size_type explode_column_idx,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Explodes a list column's elements and includes a position column.
+ *
+ * Any list is exploded, which means the elements of the list in each row are expanded into new rows
+ * in the output. The corresponding rows for other columns in the input are duplicated. A position
+ * column is added that has the index inside the original list for each row. The position column is
+ * inserted immediately before the exploded column. Example:
+ * ```
+ * [[5,10,15], 100],
+ * [[20,25],   200],
+ * [[30],      300],
+ * returns
+ * [0,   5,    100],
+ * [1,   10,   100],
+ * [2,   15,   100],
+ * [0,   20,   200],
+ * [1,   25,   200],
+ * [0,   30,   300],
+ * ```
+ *
+ * Nulls and empty lists propagate in different ways depending on what is null or empty.
+ * ```
+ * [[5,null,15], 100],
+ * [null,        200],
+ * [[],          300],
+ * returns
+ * [0,    5,     100],
+ * [1,    null,  100],
+ * [2,    15,    100],
+ * ```
+ * Note that null lists and empty lists contribute no rows to the resulting table, while null
+ * elements inside a list are kept as null entries in the exploded column.
+ *
+ * @param input_table Table to explode.
+ * @param explode_column_idx Column index to explode inside the table.
+ * @param mr Device memory resource used to allocate the returned table's device memory.
+ *
+ * @return A new table with the column at `explode_column_idx` exploded and a position column
+ * inserted before it.
+ */
+std::unique_ptr<table> explode_position(
+  table_view const& input_table,
+  size_type explode_column_idx,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /** @} */  // end of group
 
 }  // namespace cudf
diff --git a/cpp/src/reshape/explode.cu b/cpp/src/reshape/explode.cu
@@ -15,7 +15,7 @@
  */
 
 #include <cudf/column/column_device_view.cuh>
-#include <cudf/detail/gather.hpp>
+#include <cudf/detail/gather.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/reshape.hpp>
@@ -40,9 +40,13 @@ namespace {
  * @brief Function object for exploding a column.
  */
 struct explode_functor {
+  /**
+   * @brief Function object for exploding a column.
+   */
   template <typename T>
   std::unique_ptr<table> operator()(table_view const& input_table,
-                                    size_type explode_column_idx,
+                                    size_type const explode_column_idx,
+                                    bool include_pos,
                                     rmm::cuda_stream_view stream,
                                     rmm::mr::device_memory_resource* mr) const
   {
@@ -55,58 +59,72 @@ struct explode_functor {
 template <>
 std::unique_ptr<table> explode_functor::operator()<list_view>(
   table_view const& input_table,
-  size_type explode_column_idx,
+  size_type const explode_column_idx,
+  bool include_pos,
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr) const
 {
-  /* we explode by building a gather map that includes the number of entries in each list inside
-   the column for each index. Interestingly, this can be done with lower_bound across the offsets
-   as values between the offsets will all map down to the index below. We have some off-by-one
-   manipulations we need to do with the output, but it's almost our gather map by itself. Once we
-   build the gather map we need to remove the explode column from the table and run gather on it.
-   Next we build the explode column, which turns out is simply lifting the child column out of the
-   explode column. This unrolls the top level of lists. Then we need to insert the explode column
-   back into the table and return it. */
   lists_column_view lc{input_table.column(explode_column_idx)};
   auto sliced_child = lc.get_sliced_child(stream);
-  rmm::device_uvector<size_type> gather_map_indices(sliced_child.size(), stream, mr);
+  rmm::device_uvector<size_type> gather_map_indices(sliced_child.size(), stream);
 
-  // sliced columns can make this a little tricky. We have to start iterating at the start of the
-  // offsets for this column, which could be > 0. Then we also have to handle rebasing the offsets
-  // as we go.
-  auto offsets           = lc.offsets().begin<size_type>() + lc.offset();
+  // offsets_begin() accounts for a sliced column's offset; values are rebased by offsets[0] below.
+  auto offsets = lc.offsets_begin();
+  // offsets + 1 here to skip the 0th offset, which removes a - 1 operation later.
   auto offsets_minus_one = thrust::make_transform_iterator(
-    offsets, [offsets] __device__(auto i) { return (i - offsets[0]) - 1; });
+    offsets + 1, [offsets] __device__(auto i) { return (i - offsets[0]) - 1; });
   auto counting_iter = thrust::make_counting_iterator(0);
 
+  rmm::device_uvector<size_type> pos(include_pos ? sliced_child.size() : 0, stream, mr);
+
   // This looks like an off-by-one bug, but what is going on here is that we need to reduce each
-  // result from `lower_bound` by 1 to build the correct gather map. It was pointed out that
-  // this can be accomplished by simply skipping the first entry and using the result of
-  // `lower_bound` directly.
-  thrust::lower_bound(rmm::exec_policy(stream),
-                      offsets_minus_one + 1,
-                      offsets_minus_one + lc.size() + 1,
-                      counting_iter,
-                      counting_iter + gather_map_indices.size(),
-                      gather_map_indices.begin());
+  // result from `lower_bound` by 1 to build the correct gather map. This can be accomplished by
+  // skipping the first entry and using the result of `lower_bound` directly.
+  if (include_pos) {
+    thrust::transform(
+      rmm::exec_policy(stream),
+      counting_iter,
+      counting_iter + gather_map_indices.size(),
+      gather_map_indices.begin(),
+      [position_array = pos.data(), offsets_minus_one, offsets, offset_size = lc.size()] __device__(
+        auto idx) -> size_type {
+        auto lb_idx = thrust::lower_bound(
+                        thrust::seq, offsets_minus_one, offsets_minus_one + offset_size, idx) -
+                      offsets_minus_one;
+        position_array[idx] = idx - (offsets[lb_idx] - offsets[0]);  // index within owning list
+        return lb_idx;  // row of the owning list, used as the gather index
+      });
+  } else {
+    thrust::lower_bound(rmm::exec_policy(stream),
+                        offsets_minus_one,
+                        offsets_minus_one + lc.size(),
+                        counting_iter,
+                        counting_iter + gather_map_indices.size(),
+                        gather_map_indices.begin());
+  }
 
   auto select_iter = thrust::make_transform_iterator(
     thrust::make_counting_iterator(0),
     [explode_column_idx](size_type i) { return i >= explode_column_idx ? i + 1 : i; });
   std::vector<size_type> selected_columns(select_iter, select_iter + input_table.num_columns() - 1);
 
-  auto gathered_table = cudf::detail::gather(
-    input_table.select(selected_columns),
-    column_view(data_type(type_to_id<size_type>()), sliced_child.size(), gather_map_indices.data()),
-    cudf::out_of_bounds_policy::DONT_CHECK,
-    cudf::detail::negative_index_policy::ALLOWED,
-    stream,
-    mr);
+  auto gathered_table = cudf::detail::gather(input_table.select(selected_columns),
+                                             gather_map_indices.begin(),
+                                             gather_map_indices.end(),
+                                             cudf::out_of_bounds_policy::DONT_CHECK,
+                                             stream,
+                                             mr);
 
   std::vector<std::unique_ptr<column>> columns = gathered_table.release()->release();
 
   columns.insert(columns.begin() + explode_column_idx,
-                 std::make_unique<column>(column(sliced_child, stream, mr)));
+                 std::make_unique<column>(sliced_child, stream, mr));
+
+  if (include_pos) {  // same insert index, so pos lands directly before the exploded column
+    columns.insert(columns.begin() + explode_column_idx,
+                   std::make_unique<column>(
+                     data_type(type_to_id<size_type>()), sliced_child.size(), pos.release()));
+  }
 
   return std::make_unique<table>(std::move(columns));
 }
@@ -120,13 +138,15 @@ std::unique_ptr<table> explode_functor::operator()<list_view>(
  */
 std::unique_ptr<table> explode(table_view const& input_table,
                                size_type explode_column_idx,
+                               bool include_pos,  // true => also emit a position column
                                rmm::cuda_stream_view stream,
                                rmm::mr::device_memory_resource* mr)
 {
   return type_dispatcher(input_table.column(explode_column_idx).type(),
                          explode_functor{},
                          input_table,
                          explode_column_idx,
+                         include_pos,
                          stream,
                          mr);
 }
@@ -141,7 +161,18 @@ std::unique_ptr<table> explode(table_view const& input_table,
                                rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::explode(input_table, explode_column_idx, rmm::cuda_stream_default, mr);
+  return detail::explode(input_table, explode_column_idx, false, rmm::cuda_stream_default, mr);
+}
+
+/**
+ * @copydoc cudf::explode_position(input_table,explode_column_idx,rmm::mr::device_memory_resource)
+ *
+ * Forwards to detail::explode with include_pos set to true on rmm::cuda_stream_default.
+ */
+std::unique_ptr<table> explode_position(table_view const& input_table,
+                                        size_type explode_column_idx,
+                                        rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::explode(input_table, explode_column_idx, true, rmm::cuda_stream_default, mr);
 }
 
 }  // namespace cudf