diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index dfc340b1459..e63ea38a31b 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -185,6 +185,7 @@ ConfigureBench(STRINGS_BENCH string/convert_floats_benchmark.cpp string/copy_benchmark.cpp string/extract_benchmark.cpp + string/factory_benchmark.cu string/filter_benchmark.cpp string/find_benchmark.cpp string/replace_benchmark.cpp diff --git a/cpp/benchmarks/string/factory_benchmark.cu b/cpp/benchmarks/string/factory_benchmark.cu new file mode 100644 index 00000000000..6c5dceffaa8 --- /dev/null +++ b/cpp/benchmarks/string/factory_benchmark.cu @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "string_bench_args.hpp" + +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include + +#include + +namespace { +using string_pair = thrust::pair; +struct string_view_to_pair { + __device__ string_pair operator()(thrust::pair const& p) + { + return (p.second) ? string_pair{p.first.data(), p.first.size_bytes()} : string_pair{nullptr, 0}; + } +}; +} // namespace + +class StringsFactory : public cudf::benchmark { +}; + +static void BM_factory(benchmark::State& state) +{ + cudf::size_type const n_rows{static_cast(state.range(0))}; + cudf::size_type const max_str_length{static_cast(state.range(1))}; + data_profile table_profile; + table_profile.set_distribution_params( + cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); + auto const table = + create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + auto d_column = cudf::column_device_view::create(table->view().column(0)); + rmm::device_vector pairs(d_column->size()); + thrust::transform(thrust::device, + d_column->pair_begin(), + d_column->pair_end(), + pairs.data(), + string_view_to_pair{}); + + for (auto _ : state) { + cuda_event_timer raii(state, true, 0); + cudf::make_strings_column(pairs); + } + + cudf::strings_column_view input(table->view().column(0)); + state.SetBytesProcessed(state.iterations() * input.chars_size()); +} + +static void generate_bench_args(benchmark::internal::Benchmark* b) +{ + int const min_rows = 1 << 12; + int const max_rows = 1 << 24; + int const row_mult = 8; + int const min_rowlen = 1 << 5; + int const max_rowlen = 1 << 13; + int const len_mult = 4; + generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); +} + +#define STRINGS_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(StringsFactory, name) \ + (::benchmark::State & st) { BM_factory(st); } \ + BENCHMARK_REGISTER_F(StringsFactory, name) \ + ->Apply(generate_bench_args) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +STRINGS_BENCHMARK_DEFINE(factory) diff --git a/cpp/benchmarks/string/string_bench_args.hpp b/cpp/benchmarks/string/string_bench_args.hpp index f81f859de74..9c709b064dd 100644 --- a/cpp/benchmarks/string/string_bench_args.hpp +++ b/cpp/benchmarks/string/string_bench_args.hpp @@ -17,6 +17,8 @@ #include +#include + /** * @brief Generate row count and row length argument ranges for a string benchmark. * diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 28da8ef4324..988fa552100 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -31,15 +31,60 @@ #include namespace cudf { +namespace strings { +namespace detail { -template -constexpr inline bool is_signed_iterator() +/** + * @brief Returns a new chars column using the specified indices to select + * strings from the input iterator. + * + * This uses a character-parallel gather CUDA kernel that performs very + * well on a strings column with long strings (e.g. average > 64 bytes). + * + * @tparam StringIterator Iterator should produce `string_view` objects. + * @tparam MapIterator Iterator for retrieving integer indices of the `StringIterator`. + * + * @param strings_begin Start of the iterator to retrieve `string_view` instances + * @param map_begin Start of index iterator. + * @param map_end End of index iterator. + * @param offsets The offset values to be associated with the output chars column. + * @param chars_bytes The total number of bytes for the output chars column. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @return New chars column fit for a strings column. + */ +template +std::unique_ptr gather_chars(StringIterator strings_begin, + MapIterator map_begin, + MapIterator map_end, + cudf::device_span const offsets, + size_type chars_bytes, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return std::is_signed::value_type>::value; -} + auto const output_count = std::distance(map_begin, map_end); + if (output_count == 0) return make_empty_column(data_type{type_id::INT8}); -namespace strings { -namespace detail { + auto chars_column = create_chars_child_column(output_count, 0, chars_bytes, stream, mr); + auto const d_chars = chars_column->mutable_view().template data(); + + auto gather_chars_fn = [strings_begin, map_begin, offsets] __device__(size_type out_idx) -> char { + auto const out_row = + thrust::prev(thrust::upper_bound(thrust::seq, offsets.begin(), offsets.end(), out_idx)); + auto const row_idx = map_begin[thrust::distance(offsets.begin(), out_row)]; // get row index + auto const d_str = strings_begin[row_idx]; // get row's string + auto const offset = out_idx - *out_row; // get string's char + return d_str.data()[offset]; + }; + + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(chars_bytes), + d_chars, + gather_chars_fn); + + return chars_column; +} /** * @brief Returns a new strings column using the specified indices to select @@ -107,29 +152,15 @@ std::unique_ptr gather( rmm::exec_policy(stream), d_out_offsets, d_out_offsets + output_count + 1, d_out_offsets); // build chars column - size_type const out_chars_bytes = static_cast(total_bytes); - auto out_chars_column = create_chars_child_column(output_count, 0, out_chars_bytes, stream, mr); - auto const d_out_chars = out_chars_column->mutable_view().template data(); - - // fill in chars cudf::device_span const d_out_offsets_span(d_out_offsets, output_count + 1); - auto const d_in_chars = (strings_count > 0) ? strings.chars().data() : nullptr; - auto gather_chars_fn = - [d_out_offsets_span, begin, d_in_offsets, d_in_chars] __device__(size_type out_char_idx) { - // find output row index for this output char index - auto const next_row_ptr = thrust::upper_bound( - thrust::seq, d_out_offsets_span.begin(), d_out_offsets_span.end(), out_char_idx); - auto const out_row_idx = thrust::distance(d_out_offsets_span.begin(), next_row_ptr) - 1; - auto const str_char_offset = out_char_idx - d_out_offsets_span[out_row_idx]; - auto const in_row_idx = begin[out_row_idx]; - auto const in_char_offset = d_in_offsets[in_row_idx] + str_char_offset; - return d_in_chars[in_char_offset]; - }; - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(out_chars_bytes), - d_out_chars, - gather_chars_fn); + auto const d_strings = column_device_view::create(strings.parent(), stream); + auto out_chars_column = gather_chars(d_strings->begin(), + begin, + end, + d_out_offsets_span, + static_cast(total_bytes), + stream, + mr); return make_strings_column(output_count, std::move(out_offsets_column), diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh index 8e843c555c5..932f7eb0926 100644 --- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh +++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -27,6 +28,7 @@ #include #include +#include #include #include @@ -34,7 +36,27 @@ namespace cudf { namespace strings { namespace detail { -// Create a strings-type column from iterators of pointer/size pairs +/** + * @brief Average string byte-length threshold for deciding character-level + * vs. row-level parallel algorithm. + * + * This value was determined by running the factory_benchmark against different + * string lengths and observing the point where the performance is faster for + * long strings. + */ +constexpr size_type FACTORY_BYTES_PER_ROW_THRESHOLD = 64; + +/** + * @brief Create a strings-type column from iterators of pointer/size pairs + * + * @tparam IndexPairIterator iterator over type `pair` values + * + * @param begin First string row (inclusive) + * @param end Last string row (exclusive) + * @param stream CUDA stream used for device memory operations + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column + */ template std::unique_ptr make_strings_column(IndexPairIterator begin, IndexPairIterator end, @@ -51,7 +73,7 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, auto size_checker = [] __device__(string_index_pair const& item) { return (item.first != nullptr) ? item.second : 0; }; - size_t bytes = thrust::transform_reduce( + size_t const bytes = thrust::transform_reduce( rmm::exec_policy(stream), begin, end, size_checker, 0, thrust::plus()); CUDF_EXPECTS(bytes < static_cast(std::numeric_limits::max()), "total size of strings is too large for cudf column"); @@ -65,26 +87,49 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); // create null mask - auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; }; - auto new_nulls = cudf::detail::valid_if(begin, end, validator, stream, mr); - auto null_count = new_nulls.second; + auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; }; + auto new_nulls = cudf::detail::valid_if(begin, end, validator, stream, mr); + auto const null_count = new_nulls.second; auto null_mask = (null_count > 0) ? std::move(new_nulls.first) : rmm::device_buffer{0, stream, mr}; // build chars column - auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); - auto d_chars = chars_column->mutable_view().template data(); - auto copy_chars = [d_chars] __device__(auto item) { - string_index_pair str = thrust::get<0>(item); - size_type offset = thrust::get<1>(item); - if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second); - }; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_zip_iterator( - thrust::make_tuple(begin, offsets_column->view().template begin())), - strings_count, - copy_chars); + std::unique_ptr chars_column = [&] { + // use a character-parallel kernel for long string lengths + auto const avg_bytes_per_row = bytes / std::max(strings_count - null_count, 1); + if (avg_bytes_per_row > FACTORY_BYTES_PER_ROW_THRESHOLD) { + auto const d_offsets = + device_span{offsets_column->view().template data(), + static_cast(offsets_column->size())}; + auto const str_begin = thrust::make_transform_iterator(begin, [] __device__(auto ip) { + return string_view{ip.first, ip.second}; + }); + + return gather_chars(str_begin, + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + d_offsets, + static_cast(bytes), + stream, + mr); + } else { + // this approach is 2-3x faster for a large number of smaller string lengths + auto chars_column = + strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); + auto d_chars = chars_column->mutable_view().template data(); + auto copy_chars = [d_chars] __device__(auto item) { + string_index_pair const str = thrust::get<0>(item); + size_type const offset = thrust::get<1>(item); + if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second); + }; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_zip_iterator(thrust::make_tuple( + begin, offsets_column->view().template begin())), + strings_count, + copy_chars); + return chars_column; + } + }(); return make_strings_column(strings_count, std::move(offsets_column), @@ -95,7 +140,22 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, mr); } -// Create a strings-type column from iterators to chars, offsets, and bitmask. +/** + * @brief Create a strings-type column from iterators to chars, offsets, and bitmask. + * + * @tparam CharIterator iterator over character bytes (int8) + * @tparam OffsetIterator iterator over offset values (size_type) + * + * @param chars_begin First character byte (inclusive) + * @param chars_end Last character byte (exclusive) + * @param offset_begin First offset value (inclusive) + * @param offset_end Last offset value (exclusive) + * @param null_count Number of null rows + * @param null_mask The validity bitmask in Arrow format + * @param stream CUDA stream used for device memory operations + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column + */ template std::unique_ptr make_strings_column(CharIterator chars_begin, CharIterator chars_end, diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index e045476ea77..1e0d45d081d 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -224,6 +224,18 @@ constexpr inline bool is_unsigned(data_type type) return cudf::type_dispatcher(type, is_unsigned_impl{}); } +/** + * @brief Indicates whether the `Iterator` value type is unsigned. + * + * @tparam Iterator The type to verify + * @return true if the iterator's value type is unsigned + */ +template +constexpr inline bool is_signed_iterator() +{ + return std::is_signed::value_type>::value; +} + /** * @brief Indicates whether the type `T` is a floating point type. * diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu index f904c404251..bd463a7ab0d 100644 --- a/cpp/tests/strings/factories_test.cu +++ b/cpp/tests/strings/factories_test.cu @@ -19,12 +19,18 @@ #include #include #include +#include #include #include #include #include #include +#include + +#include +#include + #include #include @@ -198,3 +204,31 @@ TEST_F(StringsFactoriesTest, CreateOffsets) } } } + +namespace { +using string_pair = thrust::pair; +struct string_view_to_pair { + __device__ string_pair operator()(thrust::pair const& p) + { + return (p.second) ? string_pair{p.first.data(), p.first.size_bytes()} : string_pair{nullptr, 0}; + } +}; +} // namespace + +TEST_F(StringsFactoriesTest, StringPairWithNullsAndEmpty) +{ + cudf::test::strings_column_wrapper data( + {"", "this", "is", "", "a", "", "column", "of", "strings", "", ""}, + {0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1}); + + auto d_column = cudf::column_device_view::create(data); + rmm::device_vector pairs(d_column->size()); + thrust::transform(thrust::device, + d_column->pair_begin(), + d_column->pair_end(), + pairs.data(), + string_view_to_pair{}); + + auto result = cudf::make_strings_column(pairs); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), data); +}