From eba4f0312cc2b8315bbd950f7aa3c2680f70635f Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 5 Jan 2022 12:58:11 -0500 Subject: [PATCH] Add cudf::strings::extract_all API (#9909) Closes #9856 Adds a new `cudf::strings::extract_all` API that returns a LIST column of extracted strings given a regex pattern. This is similar to the nvstrings version of `extract` called `extract_record` but returns groups from all matches in each string instead of just the first match. Here is pseudocode of its behavior on various string inputs: ``` s = [ "ABC-200 DEF-400", "GHI-60", "JK-800", "900", NULL ] r = extract_all( s, "(\w+)-(\d+)" ) r is a LIST column of strings that looks like this: [ [ "ABC", "200", "DEF", "400" ], // 2 matches [ "GHI", "60" ], // 1 match [ "JK", "800" ], // 1 match NULL, // no match NULL // null input row ] ``` Each match results in two groups as specified in the regex pattern. Also reorganized the extract source code into the `src/strings/extract` directory. The match-counting has been factored out into a new `count_matches.cuh` since it will become common code used with `findall_record` in a follow-on PR. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Bradley Dice (https://github.com/bdice) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/9909 --- cpp/CMakeLists.txt | 3 +- .../detail/strings_column_factories.cuh | 8 +- cpp/include/cudf/strings/extract.hpp | 50 ++++- cpp/src/strings/count_matches.cuh | 105 ++++++++++ cpp/src/strings/{ => extract}/extract.cu | 0 cpp/src/strings/extract/extract_all.cu | 191 ++++++++++++++++++ cpp/tests/strings/extract_tests.cpp | 45 ++++- 7 files changed, 386 insertions(+), 16 deletions(-) create mode 100644 cpp/src/strings/count_matches.cuh rename cpp/src/strings/{ => extract}/extract.cu (100%) create mode 100644 cpp/src/strings/extract/extract_all.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 624293ad87c..84e486c7e18 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -417,7 +417,8 @@ add_library( src/strings/copying/concatenate.cu src/strings/copying/copying.cu src/strings/copying/shift.cu - src/strings/extract.cu + src/strings/extract/extract.cu + src/strings/extract/extract_all.cu src/strings/filling/fill.cu src/strings/filter_chars.cu src/strings/findall.cu diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh index b35f5df2903..9da3c6b0e91 100644 --- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh +++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh @@ -33,6 +33,12 @@ namespace cudf { namespace strings { namespace detail { +/** + * @brief Basic type expected for iterators passed to `make_strings_column` that represent string + * data in device memory. + */ +using string_index_pair = thrust::pair; + /** * @brief Average string byte-length threshold for deciding character-level * vs. row-level parallel algorithm. 
@@ -64,8 +70,6 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, size_type strings_count = thrust::distance(begin, end); if (strings_count == 0) return make_empty_column(type_id::STRING); - using string_index_pair = thrust::pair; - // check total size is not too large for cudf column auto size_checker = [] __device__(string_index_pair const& item) { return (item.first != nullptr) ? item.second : 0; diff --git a/cpp/include/cudf/strings/extract.hpp b/cpp/include/cudf/strings/extract.hpp index 6f5902266b2..466f71aace0 100644 --- a/cpp/include/cudf/strings/extract.hpp +++ b/cpp/include/cudf/strings/extract.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,20 +27,21 @@ namespace strings { */ /** - * @brief Returns a vector of strings columns for each matching group specified in the given regular - * expression pattern. + * @brief Returns a table of strings columns where each column corresponds to the matching + * group specified in the given regular expression pattern. * * All the strings for the first group will go in the first output column; the second group - * go in the second column and so on. Null entries are added if the string does match. + * go in the second column and so on. Null entries are added to the columns in row `i` if + * the string at row `i` does not match. * * Any null string entries return corresponding null output column entries. * * @code{.pseudo} * Example: - * s = ["a1","b2","c3"] - * r = extract(s,"([ab])(\\d)") - * r is now [["a","b",null], - * ["1","2",null]] + * s = ["a1", "b2", "c3"] + * r = extract(s, "([ab])(\\d)") + * r is now [ ["a", "b", null], + * ["1", "2", null] ] * @endcode * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. 
@@ -55,6 +56,39 @@ std::unique_ptr extract( std::string const& pattern, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a lists column of strings where each string column row corresponds to the + * matching group specified in the given regular expression pattern. + * + * All the matching groups for the first row will go in the first row output column; the second + * row results will go into the second row output column and so on. + * + * A null output row will result if the corresponding input string row does not match or + * that input row is null. + * + * @code{.pseudo} + * Example: + * s = ["a1 b4", "b2", "c3 a5", "b", null] + * r = extract_all(s,"([ab])(\\d)") + * r is now [ ["a", "1", "b", "4"], + * ["b", "2"], + * ["a", "5"], + * null, + * null ] + * @endcode + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @param strings Strings instance for this operation. + * @param pattern The regular expression pattern with group indicators. + * @param mr Device memory resource used to allocate any returned device memory. + * @return Lists column containing strings extracted from the input column. + */ +std::unique_ptr extract_all( + strings_column_view const& strings, + std::string const& pattern, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/count_matches.cuh b/cpp/src/strings/count_matches.cuh new file mode 100644 index 00000000000..c14142f4779 --- /dev/null +++ b/cpp/src/strings/count_matches.cuh @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include +#include + +#include +#include + +#include + +namespace cudf { +namespace strings { +namespace detail { + +/** + * @brief Functor counts the total matches to the given regex in each string. + */ +template +struct count_matches_fn { + column_device_view const d_strings; + reprog_device prog; + + __device__ size_type operator()(size_type idx) + { + if (d_strings.is_null(idx)) { return 0; } + size_type count = 0; + auto const d_str = d_strings.element(idx); + + int32_t begin = 0; + int32_t end = d_str.length(); + while ((begin < end) && (prog.find(idx, d_str, begin, end) > 0)) { + ++count; + begin = end; + end = d_str.length(); + } + return count; + } +}; + +/** + * @brief Returns a column of regex match counts for each string in the given column. + * + * A null entry will result in a zero count for that output row. + * + * @param d_strings Device view of the input strings column. + * @param d_prog Regex instance to evaluate on each string. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
+ */ +std::unique_ptr count_matches( + column_device_view const& d_strings, + reprog_device const& d_prog, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + // Create output column + auto counts = make_numeric_column( + data_type{type_id::INT32}, d_strings.size() + 1, mask_state::UNALLOCATED, stream, mr); + auto d_counts = counts->mutable_view().data(); + + auto begin = thrust::make_counting_iterator(0); + auto end = thrust::make_counting_iterator(d_strings.size()); + + // Count matches + auto const regex_insts = d_prog.insts_counts(); + if (regex_insts <= RX_SMALL_INSTS) { + count_matches_fn fn{d_strings, d_prog}; + thrust::transform(rmm::exec_policy(stream), begin, end, d_counts, fn); + } else if (regex_insts <= RX_MEDIUM_INSTS) { + count_matches_fn fn{d_strings, d_prog}; + thrust::transform(rmm::exec_policy(stream), begin, end, d_counts, fn); + } else if (regex_insts <= RX_LARGE_INSTS) { + count_matches_fn fn{d_strings, d_prog}; + thrust::transform(rmm::exec_policy(stream), begin, end, d_counts, fn); + } else { + count_matches_fn fn{d_strings, d_prog}; + thrust::transform(rmm::exec_policy(stream), begin, end, d_counts, fn); + } + + return counts; +} + +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/extract.cu b/cpp/src/strings/extract/extract.cu similarity index 100% rename from cpp/src/strings/extract.cu rename to cpp/src/strings/extract/extract.cu diff --git a/cpp/src/strings/extract/extract_all.cu b/cpp/src/strings/extract/extract_all.cu new file mode 100644 index 00000000000..584741298c2 --- /dev/null +++ b/cpp/src/strings/extract/extract_all.cu @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf { +namespace strings { +namespace detail { + +namespace { + +/** + * @brief Functor extracts matched string pointers for each input string. + * + * For regex match within a string, the specified groups are extracted into + * the `d_indices` output vector. + * The `d_offsets` are pre-computed to identify the location of where each + * string's output groups are to be written. + */ +template +struct extract_fn { + column_device_view const d_strings; + reprog_device d_prog; + offset_type const* d_offsets; + string_index_pair* d_indices; + + __device__ void operator()(size_type idx) + { + if (d_strings.is_null(idx)) { return; } + + auto const groups = d_prog.group_counts(); + auto d_output = d_indices + d_offsets[idx]; + size_type output_idx = 0; + + auto const d_str = d_strings.element(idx); + + int32_t begin = 0; + int32_t end = d_str.length(); + // match the regex + while ((begin < end) && d_prog.find(idx, d_str, begin, end) > 0) { + // extract each group into the output + for (auto group_idx = 0; group_idx < groups; ++group_idx) { + // result is an optional containing the bounds of the extracted string at group_idx + auto const extracted = d_prog.extract(idx, d_str, begin, end, group_idx); + + d_output[group_idx + output_idx] = [&] { + if (!extracted) { return string_index_pair{nullptr, 0}; } + auto const start_offset = d_str.byte_offset(extracted->first); + auto const end_offset 
= d_str.byte_offset(extracted->second); + return string_index_pair{d_str.data() + start_offset, end_offset - start_offset}; + }(); + } + // continue to next match + begin = end; + end = d_str.length(); + output_idx += groups; + } + } +}; +} // namespace + +/** + * @copydoc cudf::strings::extract_all + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr extract_all( + strings_column_view const& strings, + std::string const& pattern, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto const strings_count = strings.size(); + auto const d_strings = column_device_view::create(strings.parent(), stream); + + // Compile regex into device object. + auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); + // The extract pattern should always include groups. + auto const groups = d_prog->group_counts(); + CUDF_EXPECTS(groups > 0, "extract_all requires group indicators in the regex pattern."); + + // Get the match counts for each string. + // This column will become the output lists child offsets column. + auto offsets = count_matches(*d_strings, *d_prog, stream, mr); + auto d_offsets = offsets->mutable_view().data(); + + // Compute null output rows + auto [null_mask, null_count] = cudf::detail::valid_if( + d_offsets, d_offsets + strings_count, [] __device__(auto v) { return v > 0; }, stream, mr); + + // Return an empty lists column if there are no valid rows + if (strings_count == null_count) { + return make_lists_column(0, + make_empty_column(type_to_id()), + make_empty_column(type_id::STRING), + 0, + rmm::device_buffer{}, + stream, + mr); + } + + // Convert counts into offsets. + // Multiply each count by the number of groups. 
+ thrust::transform_exclusive_scan( + rmm::exec_policy(stream), + d_offsets, + d_offsets + strings_count + 1, + d_offsets, + [groups] __device__(auto v) { return v * groups; }, + offset_type{0}, + thrust::plus{}); + auto const total_groups = + cudf::detail::get_value(offsets->view(), strings_count, stream); + + // Create an indices vector with the total number of groups that will be extracted. + rmm::device_uvector indices(total_groups, stream); + auto d_indices = indices.data(); + auto begin = thrust::make_counting_iterator(0); + + // Call the extract functor to fill in the indices vector. + auto const regex_insts = d_prog->insts_counts(); + if (regex_insts <= RX_SMALL_INSTS) { + extract_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); + } else if (regex_insts <= RX_MEDIUM_INSTS) { + extract_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); + } else if (regex_insts <= RX_LARGE_INSTS) { + extract_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); + } else { + extract_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); + } + + // Build the child strings column from the indices. + auto strings_output = make_strings_column(indices.begin(), indices.end(), stream, mr); + + // Build the lists column from the offsets and the strings. 
+ return make_lists_column(strings_count, + std::move(offsets), + std::move(strings_output), + null_count, + std::move(null_mask), + stream, + mr); +} + +} // namespace detail + +// external API + +std::unique_ptr extract_all(strings_column_view const& strings, + std::string const& pattern, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::extract_all(strings, pattern, rmm::cuda_stream_default, mr); +} + +} // namespace strings +} // namespace cudf diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index 824bf7deb34..2bb1c6dac8e 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,14 +14,17 @@ * limitations under the License. */ -#include -#include -#include +#include + #include #include #include #include -#include + +#include +#include +#include +#include #include @@ -169,6 +172,38 @@ TEST_F(StringsExtractTests, EmptyExtractTest) CUDF_TEST_EXPECT_TABLES_EQUAL(*results, table_expected); } +TEST_F(StringsExtractTests, ExtractAllTest) +{ + std::vector h_input( + {"123 banana 7 eleven", "41 apple", "6 pear 0 pair", nullptr, "", "bees", "4 pare"}); + auto validity = + thrust::make_transform_iterator(h_input.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper input(h_input.begin(), h_input.end(), validity); + auto sv = cudf::strings_column_view(input); + + auto results = cudf::strings::extract_all(sv, "(\\d+) (\\w+)"); + + bool valids[] = {1, 1, 1, 0, 0, 0, 1}; + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{"123", "banana", "7", "eleven"}, + LCW{"41", "apple"}, + LCW{"6", "pear", "0", "pair"}, + LCW{}, + LCW{}, + LCW{}, + LCW{"4", "pare"}}, + valids); + 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); +} + +TEST_F(StringsExtractTests, Errors) +{ + cudf::test::strings_column_wrapper input({"this column intentionally left blank"}); + auto sv = cudf::strings_column_view(input); + EXPECT_THROW(cudf::strings::extract(sv, "\\w+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::extract_all(sv, "\\w+"), cudf::logic_error); +} + TEST_F(StringsExtractTests, MediumRegex) { // This results in 95 regex instructions and falls in the 'medium' range.