From eba4f0312cc2b8315bbd950f7aa3c2680f70635f Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 5 Jan 2022 12:58:11 -0500
Subject: [PATCH] Add cudf::strings::extract_all API (#9909)

Closes #9856

Adds a new `cudf::strings::extract_all` API that returns a LIST column of extracted strings given a regex pattern.

This is similar to the nvstrings version of `extract` called `extract_record`, but it returns groups from all matches in each string instead of just the first match. Here is pseudocode of its behavior on various input strings:
```
s = [ "ABC-200 DEF-400", "GHI-60", "JK-800", "900", NULL ]
r = extract_all( s, "(\w+)-(\d+)" )
r is a LIST column of strings that looks like this:

[ [ "ABC", "200", "DEF", "400" ], // 2 matches
  [ "GHI", "60" ], // 1 match
  [ "JK", "800" ], // 1 match
  NULL,            // no match
  NULL
]
```
Each match results in two groups as specified in the regex pattern.

Also reorganized the extract source code into `src/strings/extract` directory.
The match-counting has been factored out into the new `count_matches.cuh` since it will become common code used with `findall_record` in a follow-on PR.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Bradley Dice (https://github.com/bdice)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/9909
---
 cpp/CMakeLists.txt                            |   3 +-
 .../detail/strings_column_factories.cuh       |   8 +-
 cpp/include/cudf/strings/extract.hpp          |  50 ++++-
 cpp/src/strings/count_matches.cuh             | 105 ++++++++++
 cpp/src/strings/{ => extract}/extract.cu      |   0
 cpp/src/strings/extract/extract_all.cu        | 191 ++++++++++++++++++
 cpp/tests/strings/extract_tests.cpp           |  45 ++++-
 7 files changed, 386 insertions(+), 16 deletions(-)
 create mode 100644 cpp/src/strings/count_matches.cuh
 rename cpp/src/strings/{ => extract}/extract.cu (100%)
 create mode 100644 cpp/src/strings/extract/extract_all.cu
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 624293ad87c..84e486c7e18 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -417,7 +417,8 @@ add_library(
   src/strings/copying/concatenate.cu
   src/strings/copying/copying.cu
   src/strings/copying/shift.cu
-  src/strings/extract.cu
+  src/strings/extract/extract.cu
+  src/strings/extract/extract_all.cu
   src/strings/filling/fill.cu
   src/strings/filter_chars.cu
   src/strings/findall.cu
diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh
index b35f5df2903..9da3c6b0e91 100644
--- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh
+++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh
@@ -33,6 +33,12 @@ namespace cudf {
 namespace strings {
 namespace detail {
 
+/**
+ * @brief Basic type expected for iterators passed to `make_strings_column` that represent string
+ * data in device memory: `first` is the character pointer and `second` is the size in bytes.
+ */
+using string_index_pair = thrust::pair<const char*, size_type>;
+
 /**
  * @brief Average string byte-length threshold for deciding character-level
  * vs. row-level parallel algorithm.
@@ -64,8 +70,6 @@ std::unique_ptr<column> make_strings_column(IndexPairIterator begin,
   size_type strings_count = thrust::distance(begin, end);
   if (strings_count == 0) return make_empty_column(type_id::STRING);
 
-  using string_index_pair = thrust::pair<const char*, size_type>;
-
   // check total size is not too large for cudf column
   auto size_checker = [] __device__(string_index_pair const& item) {
     return (item.first != nullptr) ? item.second : 0;
diff --git a/cpp/include/cudf/strings/extract.hpp b/cpp/include/cudf/strings/extract.hpp
index 6f5902266b2..466f71aace0 100644
--- a/cpp/include/cudf/strings/extract.hpp
+++ b/cpp/include/cudf/strings/extract.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,20 +27,21 @@ namespace strings {
  */
 
 /**
- * @brief Returns a vector of strings columns for each matching group specified in the given regular
- * expression pattern.
+ * @brief Returns a table of strings columns where each column corresponds to the matching
+ * group specified in the given regular expression pattern.
  *
  * All the strings for the first group will go in the first output column; the second group
- * go in the second column and so on. Null entries are added if the string does match.
+ * go in the second column and so on. Null entries are added to the columns in row `i` if
+ * the string at row `i` does not match.
  *
  * Any null string entries return corresponding null output column entries.
  *
  * @code{.pseudo}
  * Example:
- * s = ["a1","b2","c3"]
- * r = extract(s,"([ab])(\\d)")
- * r is now [["a","b",null],
- *           ["1","2",null]]
+ * s = ["a1", "b2", "c3"]
+ * r = extract(s, "([ab])(\\d)")
+ * r is now [ ["a", "b", null],
+ *            ["1", "2", null] ]
  * @endcode
  *
  * See the @ref md_regex "Regex Features" page for details on patterns supported by this API.
@@ -55,6 +56,39 @@ std::unique_ptr<table> extract(
   std::string const& pattern,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Returns a lists column of strings where each list row holds the strings extracted by
+ * the groups specified in the given regular expression pattern.
+ *
+ * The groups from every match in the first input row go, in match order, into the first output
+ * row; the second input row's results go into the second output row and so on.
+ *
+ * A null output row will result if the corresponding input string row does not match or
+ * that input row is null.
+ *
+ * @code{.pseudo}
+ * Example:
+ * s = ["a1 b4", "b2", "c3 a5", "b", null]
+ * r = extract_all(s, "([ab])(\\d)")
+ * r is now [ ["a", "1", "b", "4"],
+ *            ["b", "2"],
+ *            ["a", "5"],
+ *            null,
+ *            null ]
+ * @endcode
+ *
+ * See the @ref md_regex "Regex Features" page for details on patterns supported by this API.
+ *
+ * @param strings Strings instance for this operation.
+ * @param pattern The regular expression pattern with group indicators.
+ * @param mr Device memory resource used to allocate any returned device memory.
+ * @return Lists column containing strings extracted from the input column.
+ */
+std::unique_ptr<column> extract_all(
+  strings_column_view const& strings,
+  std::string const& pattern,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /** @} */  // end of doxygen group
 }  // namespace strings
 }  // namespace cudf
diff --git a/cpp/src/strings/count_matches.cuh b/cpp/src/strings/count_matches.cuh
new file mode 100644
index 00000000000..c14142f4779
--- /dev/null
+++ b/cpp/src/strings/count_matches.cuh
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <strings/regex/regex.cuh>
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/strings/string_view.cuh>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/transform.h>
+
+namespace cudf {
+namespace strings {
+namespace detail {
+
+/**
+ * @brief Functor counts the non-overlapping regex matches in each string (0 for a null row).
+ */
+template <int stack_size>
+struct count_matches_fn {
+  column_device_view const d_strings;
+  reprog_device prog;
+
+  __device__ size_type operator()(size_type idx)
+  {
+    if (d_strings.is_null(idx)) { return 0; }
+    size_type count  = 0;
+    auto const d_str = d_strings.element<string_view>(idx);
+
+    int32_t begin = 0;
+    int32_t end   = d_str.length();
+    while ((begin < end) && (prog.find<stack_size>(idx, d_str, begin, end) > 0)) {
+      ++count;
+      begin = end + (begin == end);  // step past an empty match so the loop always advances
+      end   = d_str.length();
+    }
+    return count;
+  }
+};
+
+/**
+ * @brief Returns a column of regex match counts for each string in the given column.
+ *
+ * A null entry yields a zero count. The INT32 result has `d_strings.size() + 1` rows and only the
+ * first `d_strings.size()` are written here. Declared `inline` as this header may be shared.
+ * @param d_strings Device view of the input strings column.
+ * @param d_prog Regex instance to evaluate on each string.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned column's device memory.
+ */
+inline std::unique_ptr<column> count_matches(
+  column_device_view const& d_strings,
+  reprog_device const& d_prog,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+{
+  // Create output column
+  auto counts = make_numeric_column(
+    data_type{type_id::INT32}, d_strings.size() + 1, mask_state::UNALLOCATED, stream, mr);
+  auto d_counts = counts->mutable_view().data<offset_type>();
+
+  auto begin = thrust::make_counting_iterator<size_type>(0);
+  auto end   = thrust::make_counting_iterator<size_type>(d_strings.size());
+
+  // Count matches
+  auto const regex_insts = d_prog.insts_counts();
+  if (regex_insts <= RX_SMALL_INSTS) {
+    count_matches_fn<RX_STACK_SMALL> fn{d_strings, d_prog};
+    thrust::transform(rmm::exec_policy(stream), begin, end, d_counts, fn);
+  } else if (regex_insts <= RX_MEDIUM_INSTS) {
+    count_matches_fn<RX_STACK_MEDIUM> fn{d_strings, d_prog};
+    thrust::transform(rmm::exec_policy(stream), begin, end, d_counts, fn);
+  } else if (regex_insts <= RX_LARGE_INSTS) {
+    count_matches_fn<RX_STACK_LARGE> fn{d_strings, d_prog};
+    thrust::transform(rmm::exec_policy(stream), begin, end, d_counts, fn);
+  } else {
+    count_matches_fn<RX_STACK_ANY> fn{d_strings, d_prog};
+    thrust::transform(rmm::exec_policy(stream), begin, end, d_counts, fn);
+  }
+
+  return counts;
+}
+
+}  // namespace detail
+}  // namespace strings
+}  // namespace cudf
diff --git a/cpp/src/strings/extract.cu b/cpp/src/strings/extract/extract.cu
similarity index 100%
rename from cpp/src/strings/extract.cu
rename to cpp/src/strings/extract/extract.cu
diff --git a/cpp/src/strings/extract/extract_all.cu b/cpp/src/strings/extract/extract_all.cu
new file mode 100644
index 00000000000..584741298c2
--- /dev/null
+++ b/cpp/src/strings/extract/extract_all.cu
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <strings/count_matches.cuh>
+#include <strings/regex/regex.cuh>
+#include <strings/utilities.hpp>
+
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/get_value.cuh>
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/strings/detail/strings_column_factories.cuh>
+#include <cudf/strings/extract.hpp>
+#include <cudf/strings/string_view.cuh>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/for_each.h>
+#include <thrust/transform_scan.h>
+
+namespace cudf {
+namespace strings {
+namespace detail {
+
+namespace {
+
+/**
+ * @brief Functor extracts matched string pointers for each input string.
+ *
+ * For each regex match within a string, the specified groups are extracted into
+ * the `d_indices` output vector.
+ * The `d_offsets` are pre-computed to identify the location of where each
+ * string's output groups are to be written.
+ */
+template <int stack_size>
+struct extract_fn {
+  column_device_view const d_strings;
+  reprog_device d_prog;
+  offset_type const* d_offsets;
+  string_index_pair* d_indices;
+
+  __device__ void operator()(size_type idx)
+  {
+    if (d_strings.is_null(idx)) { return; }
+
+    auto const groups    = d_prog.group_counts();
+    auto d_output        = d_indices + d_offsets[idx];
+    size_type output_idx = 0;
+
+    auto const d_str = d_strings.element<string_view>(idx);
+
+    int32_t begin = 0;
+    int32_t end   = d_str.length();
+    // match the regex
+    while ((begin < end) && d_prog.find<stack_size>(idx, d_str, begin, end) > 0) {
+      // extract each group into the output
+      for (auto group_idx = 0; group_idx < groups; ++group_idx) {
+        // result is an optional containing the bounds of the extracted string at group_idx
+        auto const extracted = d_prog.extract<stack_size>(idx, d_str, begin, end, group_idx);
+
+        d_output[group_idx + output_idx] = [&] {
+          if (!extracted) { return string_index_pair{nullptr, 0}; }
+          auto const start_offset = d_str.byte_offset(extracted->first);
+          auto const end_offset   = d_str.byte_offset(extracted->second);
+          return string_index_pair{d_str.data() + start_offset, end_offset - start_offset};
+        }();
+      }
+      // continue to next match
+      begin = end + (begin == end);  // same stepping as count_matches_fn; skips an empty match
+      end   = d_str.length();
+      output_idx += groups;
+    }
+  }
+};
+}  // namespace
+
+/**
+ * @copydoc cudf::strings::extract_all
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+std::unique_ptr<column> extract_all(
+  strings_column_view const& strings,
+  std::string const& pattern,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+{
+  auto const strings_count = strings.size();
+  auto const d_strings     = column_device_view::create(strings.parent(), stream);
+
+  // Compile regex into device object.
+  auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream);
+  // The extract pattern should always include groups.
+  auto const groups = d_prog->group_counts();
+  CUDF_EXPECTS(groups > 0, "extract_all requires group indicators in the regex pattern.");
+
+  // Get the match counts for each string.
+  // This column will become the output lists child offsets column.
+  auto offsets   = count_matches(*d_strings, *d_prog, stream, mr);
+  auto d_offsets = offsets->mutable_view().data<offset_type>();
+
+  // Compute null output rows
+  auto [null_mask, null_count] = cudf::detail::valid_if(
+    d_offsets, d_offsets + strings_count, [] __device__(auto v) { return v > 0; }, stream, mr);
+
+  // Only a zero-row input yields an empty result; all-null output keeps its row count (below).
+  if (strings_count == 0) {
+    return make_lists_column(0,
+                             make_empty_column(type_to_id<offset_type>()),
+                             make_empty_column(type_id::STRING),
+                             0,
+                             rmm::device_buffer{},
+                             stream,
+                             mr);
+  }
+
+  // Convert counts into offsets.
+  // Multiply each count by the number of groups.
+  thrust::transform_exclusive_scan(
+    rmm::exec_policy(stream),
+    d_offsets,
+    d_offsets + strings_count + 1,
+    d_offsets,
+    [groups] __device__(auto v) { return v * groups; },
+    offset_type{0},
+    thrust::plus{});
+  auto const total_groups =
+    cudf::detail::get_value<offset_type>(offsets->view(), strings_count, stream);
+
+  // Create an indices vector with the total number of groups that will be extracted.
+  rmm::device_uvector<string_index_pair> indices(total_groups, stream);
+  auto d_indices = indices.data();
+  auto begin     = thrust::make_counting_iterator<size_type>(0);
+
+  // Call the extract functor to fill in the indices vector.
+  auto const regex_insts = d_prog->insts_counts();
+  if (regex_insts <= RX_SMALL_INSTS) {
+    extract_fn<RX_STACK_SMALL> fn{*d_strings, *d_prog, d_offsets, d_indices};
+    thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn);
+  } else if (regex_insts <= RX_MEDIUM_INSTS) {
+    extract_fn<RX_STACK_MEDIUM> fn{*d_strings, *d_prog, d_offsets, d_indices};
+    thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn);
+  } else if (regex_insts <= RX_LARGE_INSTS) {
+    extract_fn<RX_STACK_LARGE> fn{*d_strings, *d_prog, d_offsets, d_indices};
+    thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn);
+  } else {
+    extract_fn<RX_STACK_ANY> fn{*d_strings, *d_prog, d_offsets, d_indices};
+    thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn);
+  }
+
+  // Build the child strings column from the indices.
+  auto strings_output = make_strings_column(indices.begin(), indices.end(), stream, mr);
+
+  // Build the lists column from the offsets and the strings.
+  return make_lists_column(strings_count,
+                           std::move(offsets),
+                           std::move(strings_output),
+                           null_count,
+                           std::move(null_mask),
+                           stream,
+                           mr);
+}
+
+}  // namespace detail
+
+// external API: runs detail::extract_all on rmm::cuda_stream_default
+
+std::unique_ptr<column> extract_all(strings_column_view const& strings,
+                                    std::string const& pattern,
+                                    rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::extract_all(strings, pattern, rmm::cuda_stream_default, mr);
+}
+
+}  // namespace strings
+}  // namespace cudf
diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp
index 824bf7deb34..2bb1c6dac8e 100644
--- a/cpp/tests/strings/extract_tests.cpp
+++ b/cpp/tests/strings/extract_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,14 +14,17 @@
  * limitations under the License.
  */
 
-#include <cudf/strings/extract.hpp>
-#include <cudf/strings/strings_column_view.hpp>
-#include <cudf/table/table_view.hpp>
+#include <tests/strings/utilities.h>
+
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/table_utilities.hpp>
-#include <tests/strings/utilities.h>
+
+#include <cudf/detail/iterator.cuh>
+#include <cudf/strings/extract.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/table/table_view.hpp>
 
 #include <vector>
 
@@ -169,6 +172,38 @@ TEST_F(StringsExtractTests, EmptyExtractTest)
   CUDF_TEST_EXPECT_TABLES_EQUAL(*results, table_expected);
 }
 
+TEST_F(StringsExtractTests, ExtractAllTest)
+{
+  std::vector<const char*> h_input(
+    {"123 banana 7 eleven", "41 apple", "6 pear 0 pair", nullptr, "", "bees", "4 pare"});
+  auto validity =
+    thrust::make_transform_iterator(h_input.begin(), [](auto str) { return str != nullptr; });
+  cudf::test::strings_column_wrapper input(h_input.begin(), h_input.end(), validity);
+  auto sv = cudf::strings_column_view(input);
+
+  auto results = cudf::strings::extract_all(sv, "(\\d+) (\\w+)");  // two groups per match
+
+  bool valids[] = {1, 1, 1, 0, 0, 0, 1};  // rows 3-5 (null input, "", "bees") are expected null
+  using LCW     = cudf::test::lists_column_wrapper<cudf::string_view>;
+  LCW expected({LCW{"123", "banana", "7", "eleven"},
+                LCW{"41", "apple"},
+                LCW{"6", "pear", "0", "pair"},
+                LCW{},
+                LCW{},
+                LCW{},
+                LCW{"4", "pare"}},
+               valids);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected);
+}
+
+TEST_F(StringsExtractTests, Errors)
+{
+  cudf::test::strings_column_wrapper input({"this column intentionally left blank"});
+  auto sv = cudf::strings_column_view(input);
+  EXPECT_THROW(cudf::strings::extract(sv, "\\w+"), cudf::logic_error);  // pattern has no groups
+  EXPECT_THROW(cudf::strings::extract_all(sv, "\\w+"), cudf::logic_error);
+}
+
 TEST_F(StringsExtractTests, MediumRegex)
 {
   // This results in 95 regex instructions and falls in the 'medium' range.