diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 682f1ac5fca..a8e33f0bf5b 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -191,4 +191,5 @@ ConfigureBench(STRINGS_BENCH
   string/replace_benchmark.cpp
   string/split_benchmark.cpp
   string/substring_benchmark.cpp
+  string/translate_benchmark.cpp
   string/url_decode_benchmark.cpp)
diff --git a/cpp/benchmarks/string/translate_benchmark.cpp b/cpp/benchmarks/string/translate_benchmark.cpp
new file mode 100644
index 00000000000..c49a986d744
--- /dev/null
+++ b/cpp/benchmarks/string/translate_benchmark.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "string_bench_args.hpp"
+
+#include <benchmark/benchmark.h>
+#include <benchmarks/common/generate_benchmark_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/strings/translate.hpp>
+#include <cudf_test/column_wrapper.hpp>
+
+#include <thrust/iterator/counting_iterator.h>
+
+#include <algorithm>
+
+class StringTranslate : public cudf::benchmark {
+};
+
+using entry_type = std::pair<cudf::char_utf8, cudf::char_utf8>;
+
+static void BM_translate(benchmark::State& state, int entry_count)
+{
+  cudf::size_type const n_rows{static_cast<cudf::size_type>(state.range(0))};
+  cudf::size_type const max_str_length{static_cast<cudf::size_type>(state.range(1))};
+  data_profile table_profile;
+  table_profile.set_distribution_params(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
+  auto const table =
+    create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile);
+  cudf::strings_column_view input(table->view().column(0));
+
+  std::vector<entry_type> entries(entry_count);
+  std::transform(thrust::counting_iterator<int>(0),
+                 thrust::counting_iterator<int>(entry_count),
+                 entries.begin(),
+                 [](auto idx) -> entry_type {
+                   return entry_type{'!' + idx, '~' - idx};
+                 });
+
+  for (auto _ : state) {
+    cuda_event_timer raii(state, true, 0);
+    cudf::strings::translate(input, entries);
+  }
+
+  state.SetBytesProcessed(state.iterations() * input.chars_size());
+}
+
+static void generate_bench_args(benchmark::internal::Benchmark* b)
+{
+  int const min_rows   = 1 << 12;
+  int const max_rows   = 1 << 24;
+  int const row_mult   = 8;
+  int const min_rowlen = 1 << 5;
+  int const max_rowlen = 1 << 13;
+  int const len_mult   = 4;
+  generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
+}
+
+#define STRINGS_BENCHMARK_DEFINE(name, entries)            \
+  BENCHMARK_DEFINE_F(StringTranslate, name)                \
+  (::benchmark::State & st) { BM_translate(st, entries); } \
+  BENCHMARK_REGISTER_F(StringTranslate, name)              \
+    ->Apply(generate_bench_args)                           \
+    ->UseManualTime()                                      \
+    ->Unit(benchmark::kMillisecond);
+
+STRINGS_BENCHMARK_DEFINE(translate_small, 5)
+STRINGS_BENCHMARK_DEFINE(translate_medium, 25)
+STRINGS_BENCHMARK_DEFINE(translate_large, 50)
diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu
index 08af1d76d22..138fe3fa508 100644
--- a/cpp/src/strings/translate.cu
+++ b/cpp/src/strings/translate.cu
@@ -19,7 +19,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -30,7 +29,8 @@
 #include
 #include
-#include
+#include
+#include
 #include
@@ -46,31 +46,37 @@ namespace {
  */
 struct translate_fn {
   column_device_view const d_strings;
-  rmm::device_vector<translate_table>::iterator table_begin;
-  rmm::device_vector<translate_table>::iterator table_end;
-  int32_t const* d_offsets{};
+  rmm::device_uvector<translate_table>::iterator table_begin;
+  rmm::device_uvector<translate_table>::iterator table_end;
+  int32_t* d_offsets{};
   char* d_chars{};
 
-  __device__ size_type operator()(size_type idx)
+  __device__ void operator()(size_type idx)
   {
-    if (d_strings.is_null(idx)) return 0;
-    string_view d_str = d_strings.element<string_view>(idx);
-    size_type bytes   = d_str.size_bytes();
-    char* out_ptr     = d_offsets ? d_chars + d_offsets[idx] : nullptr;
+    if (d_strings.is_null(idx)) {
+      if (!d_chars) d_offsets[idx] = 0;
+      return;
+    }
+    string_view const d_str = d_strings.element<string_view>(idx);
+
+    size_type bytes = d_str.size_bytes();
+    char* out_ptr   = d_chars ? d_chars + d_offsets[idx] : nullptr;
     for (auto chr : d_str) {
-      auto entry =
-        thrust::find_if(thrust::seq, table_begin, table_end, [chr] __device__(auto const& te) {
-          return te.first == chr;
-        });
-      if (entry != table_end) {
+      auto const entry =
+        thrust::lower_bound(thrust::seq,
+                            table_begin,
+                            table_end,
+                            translate_table{chr, 0},
+                            [](auto const& lhs, auto const& rhs) { return lhs.first < rhs.first; });
+      if (entry != table_end && entry->first == chr) {
         bytes -= bytes_in_char_utf8(chr);
-        chr = static_cast<translate_table>(*entry).second;
+        chr = entry->second;
         if (chr)  // if null, skip the character
           bytes += bytes_in_char_utf8(chr);
       }
       if (chr && out_ptr) out_ptr += from_char_utf8(chr, out_ptr);
     }
-    return bytes;
+    if (!d_chars) d_offsets[idx] = bytes;
   }
 };
@@ -83,8 +89,7 @@ std::unique_ptr<column> translate(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
-  size_type strings_count = strings.size();
-  if (strings_count == 0) return make_empty_strings_column(stream, mr);
+  if (strings.is_empty()) return make_empty_strings_column(stream, mr);
   size_type table_size = static_cast<size_type>(chars_table.size());
 
   // convert input table
@@ -92,35 +97,32 @@ std::unique_ptr<column> translate(
   std::transform(chars_table.begin(), chars_table.end(), htable.begin(), [](auto entry) {
     return translate_table{entry.first, entry.second};
   });
+  // The size of this table is usually much less than 100 so it was
+  // found to be more efficient to sort on the CPU than the GPU.
+  thrust::sort(htable.begin(), htable.end(), [](auto const& lhs, auto const& rhs) {
+    return lhs.first < rhs.first;
+  });
 
   // copy translate table to device memory
-  rmm::device_vector<translate_table> table(htable);
-
-  auto strings_column = column_device_view::create(strings.parent(), stream);
-  auto d_strings      = *strings_column;
-  // create null mask
-  rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr);
-  // create offsets column
-  auto offsets_transformer_itr = cudf::detail::make_counting_transform_iterator(
-    0, translate_fn{d_strings, table.begin(), table.end()});
-  auto offsets_column = make_offsets_child_column(
-    offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
-  auto d_offsets = offsets_column->view().data<int32_t>();
-
-  // build chars column
-  size_type bytes   = thrust::device_pointer_cast(d_offsets)[strings_count];
-  auto chars_column = strings::detail::create_chars_child_column(
-    strings_count, strings.null_count(), bytes, stream, mr);
-  auto d_chars = chars_column->mutable_view().data<char>();
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::make_counting_iterator(0),
-                     strings_count,
-                     translate_fn{d_strings, table.begin(), table.end(), d_offsets, d_chars});
-
-  return make_strings_column(strings_count,
-                             std::move(offsets_column),
-                             std::move(chars_column),
+  rmm::device_uvector<translate_table> table(htable.size(), stream);
+  CUDA_TRY(cudaMemcpyAsync(table.data(),
+                           htable.data(),
+                           sizeof(translate_table) * htable.size(),
+                           cudaMemcpyHostToDevice,
+                           stream.value()));
+
+  auto d_strings = column_device_view::create(strings.parent(), stream);
+
+  auto children = make_strings_children(translate_fn{*d_strings, table.begin(), table.end()},
+                                        strings.size(),
+                                        strings.null_count(),
+                                        stream,
+                                        mr);
+
+  return make_strings_column(strings.size(),
+                             std::move(children.first),
+                             std::move(children.second),
                              strings.null_count(),
-                             std::move(null_mask),
+                             cudf::detail::copy_bitmask(strings.parent(), stream, mr),
                              stream,
                              mr);
 }
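
For reference, the public API exercised by this benchmark and refactor is cudf::strings::translate, which applies a host-provided table of (from, to) UTF-8 character pairs to every string in a column. The snippet below is a minimal, illustrative host-side sketch of calling it; the function name replace_chars and the example pairs are hypothetical and not part of this change. Mapping a character to 0 removes it, matching the "if null, skip the character" branch in translate_fn.

#include <cudf/column/column.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/strings/translate.hpp>

#include <memory>
#include <utility>
#include <vector>

// Illustrative only: build a small translate table and apply it to a strings column.
std::unique_ptr<cudf::column> replace_chars(cudf::strings_column_view const& input)
{
  // '!' becomes '.', and '~' is removed (a target of 0 deletes the character).
  std::vector<std::pair<cudf::char_utf8, cudf::char_utf8>> table{{'!', '.'}, {'~', 0}};
  return cudf::strings::translate(input, table);
}

Because this change sorts the table once on the host, the device-side functor can look up each character with thrust::lower_bound over the sorted entries instead of the linear thrust::find_if used previously.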