diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 0bf92ff54bb..7b50989779d 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -173,8 +173,11 @@ ConfigureBench(AST_BENCH ast/transform_benchmark.cpp) ConfigureBench(BINARYOP_BENCH binaryop/binaryop_benchmark.cu) ################################################################################################### -# - subword tokenizer benchmark ------------------------------------------------------------------- -ConfigureBench(SUBWORD_TOKENIZER_BENCH text/subword_benchmark.cpp) +# - nvtext benchmark ------------------------------------------------------------------- +ConfigureBench(TEXT_BENCH + text/normalize_benchmark.cpp + text/normalize_spaces_benchmark.cpp + text/subword_benchmark.cpp) ################################################################################################### # - strings benchmark ------------------------------------------------------------------- diff --git a/cpp/benchmarks/text/normalize_benchmark.cpp b/cpp/benchmarks/text/normalize_benchmark.cpp new file mode 100644 index 00000000000..32c4fb7dcde --- /dev/null +++ b/cpp/benchmarks/text/normalize_benchmark.cpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +class TextNormalize : public cudf::benchmark { +}; + +static void BM_normalize(benchmark::State& state, bool to_lower) +{ + auto const n_rows = static_cast(state.range(0)); + auto const max_str_length = static_cast(state.range(1)); + data_profile table_profile; + table_profile.set_distribution_params( + cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); + auto const table = + create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + cudf::strings_column_view input(table->view().column(0)); + + for (auto _ : state) { + cuda_event_timer raii(state, true, 0); + nvtext::normalize_characters(input, to_lower); + } + + state.SetBytesProcessed(state.iterations() * input.chars_size()); +} + +static void generate_bench_args(benchmark::internal::Benchmark* b) +{ + int const min_rows = 1 << 12; + int const max_rows = 1 << 24; + int const row_mult = 8; + int const min_rowlen = 1 << 5; + int const max_rowlen = 1 << 13; + int const len_mult = 4; + for (int row_count = min_rows; row_count <= max_rows; row_count *= row_mult) { + for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) { + // avoid generating combinations that exceed the cudf column limit + size_t total_chars = static_cast(row_count) * rowlen * 4; + if (total_chars < std::numeric_limits::max()) { + b->Args({row_count, rowlen}); + } + } + } +} + +#define NVTEXT_BENCHMARK_DEFINE(name, lower) \ + BENCHMARK_DEFINE_F(TextNormalize, name) \ + (::benchmark::State & st) { BM_normalize(st, lower); } \ + BENCHMARK_REGISTER_F(TextNormalize, name) \ + ->Apply(generate_bench_args) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +NVTEXT_BENCHMARK_DEFINE(characters, false) +NVTEXT_BENCHMARK_DEFINE(to_lower, true) diff --git a/cpp/benchmarks/text/normalize_spaces_benchmark.cpp b/cpp/benchmarks/text/normalize_spaces_benchmark.cpp new file mode 100644 index 00000000000..dcabb0c225c --- /dev/null +++ b/cpp/benchmarks/text/normalize_spaces_benchmark.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +class TextNormalize : public cudf::benchmark { +}; + +static void BM_normalize(benchmark::State& state) +{ + auto const n_rows = static_cast(state.range(0)); + auto const max_str_length = static_cast(state.range(1)); + data_profile table_profile; + table_profile.set_distribution_params( + cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); + auto const table = + create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + cudf::strings_column_view input(table->view().column(0)); + + for (auto _ : state) { + cuda_event_timer raii(state, true, 0); + nvtext::normalize_spaces(input); + } + + state.SetBytesProcessed(state.iterations() * input.chars_size()); +} + +static void generate_bench_args(benchmark::internal::Benchmark* b) +{ + int const min_rows = 1 << 12; + int const max_rows = 1 << 24; + int const row_mult = 8; + int const min_rowlen = 1 << 5; + int const max_rowlen = 1 << 13; + int const len_mult = 4; + generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); +} + +#define NVTEXT_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(TextNormalize, name) \ + (::benchmark::State & st) { BM_normalize(st); } \ + BENCHMARK_REGISTER_F(TextNormalize, name) \ + ->Apply(generate_bench_args) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +NVTEXT_BENCHMARK_DEFINE(spaces) diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 6ebe529b56e..e3a43ac25c0 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -14,6 +14,12 @@ * limitations under the License. */ +#include +#include +#include + +#include + #include #include #include @@ -22,17 +28,11 @@ #include #include #include +#include #include #include #include -#include - -#include - -#include -#include - #include #include @@ -54,32 +54,39 @@ namespace { */ struct normalize_spaces_fn { cudf::column_device_view const d_strings; // strings to normalize - int32_t const* d_offsets{}; // offsets into d_buffer - char* d_buffer{}; // output buffer for characters + int32_t* d_offsets{}; // offsets into d_buffer + char* d_chars{}; // output buffer for characters - __device__ int32_t operator()(cudf::size_type idx) + __device__ void operator()(cudf::size_type idx) { - if (d_strings.is_null(idx)) return 0; - cudf::string_view single_space(" ", 1); + if (d_strings.is_null(idx)) { + if (!d_chars) d_offsets[idx] = 0; + return; + } + cudf::string_view const single_space(" ", 1); auto const d_str = d_strings.element(idx); - char* buffer = d_offsets ? d_buffer + d_offsets[idx] : nullptr; + char* buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; char* optr = buffer; // running output pointer int32_t nbytes = 0; // holds the number of bytes per output string - // create tokenizer for this string with whitespace delimiter (default) + + // create a tokenizer for this string with whitespace delimiter (default) characters_tokenizer tokenizer(d_str); + // this will retrieve tokens automatically skipping runs of whitespace while (tokenizer.next_token()) { - auto token_pos = tokenizer.token_byte_positions(); + auto const token_pos = tokenizer.token_byte_positions(); nbytes += token_pos.second - token_pos.first + 1; // token size plus a single space if (optr) { - cudf::string_view token(d_str.data() + token_pos.first, token_pos.second - token_pos.first); + cudf::string_view const token(d_str.data() + token_pos.first, + token_pos.second - token_pos.first); if (optr != buffer) // prepend space unless we are at the beginning optr = cudf::strings::detail::copy_string(optr, single_space); // write token to output buffer - optr = cudf::strings::detail::copy_string(optr, token); // copy token to output + optr = cudf::strings::detail::copy_string(optr, token); } } - return (nbytes > 0) ? nbytes - 1 : 0; // remove trailing space + // remove trailing space + if (!d_chars) d_offsets[idx] = (nbytes > 0) ? nbytes - 1 : 0; } }; @@ -95,7 +102,7 @@ struct codepoint_to_utf8_fn { cudf::column_device_view const d_strings; // input strings uint32_t const* cp_data; // full code-point array int32_t const* d_cp_offsets{}; // offsets to each string's code-point array - int32_t const* d_offsets{}; // offsets for the output strings + int32_t* d_offsets{}; // offsets for the output strings char* d_chars{}; // buffer for the output strings column /** @@ -105,7 +112,7 @@ struct codepoint_to_utf8_fn { * @param count number of code-points in `str_cps` * @return Number of bytes required for the output */ - __device__ cudf::size_type compute_output_size(uint32_t const* str_cps, uint32_t count) + __device__ int32_t compute_output_size(uint32_t const* str_cps, uint32_t count) { return thrust::transform_reduce( thrust::seq, @@ -113,17 +120,23 @@ struct codepoint_to_utf8_fn { str_cps + count, [](auto cp) { return 1 + (cp >= UTF8_1BYTE) + (cp >= UTF8_2BYTE) + (cp >= UTF8_3BYTE); }, 0, - thrust::plus()); + thrust::plus()); } - __device__ cudf::size_type operator()(cudf::size_type idx) + __device__ void operator()(cudf::size_type idx) { - if (d_strings.is_null(idx)) return 0; + if (d_strings.is_null(idx)) { + if (!d_chars) d_offsets[idx] = 0; + return; + } auto const d_str = d_strings.element(idx); auto const offset = d_cp_offsets[idx]; auto const count = d_cp_offsets[idx + 1] - offset; // number of code-points auto str_cps = cp_data + offset; // code-points for this string - if (!d_chars) return compute_output_size(str_cps, count); + if (!d_chars) { + d_offsets[idx] = compute_output_size(str_cps, count); + return; + } // convert each code-point to 1-4 UTF-8 encoded bytes char* out_ptr = d_chars + d_offsets[idx]; for (uint32_t jdx = 0; jdx < count; ++jdx) { @@ -149,7 +162,6 @@ struct codepoint_to_utf8_fn { *out_ptr++ = static_cast((code_point & 0x3F) | 0x0080); } } - return 0; } }; @@ -161,40 +173,20 @@ std::unique_ptr normalize_spaces( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - cudf::size_type strings_count = strings.size(); - if (strings_count == 0) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); // create device column - auto strings_column = cudf::column_device_view::create(strings.parent(), stream); - auto d_strings = *strings_column; - // copy bitmask - rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); - - // create offsets by calculating size of each string for output - auto offsets_transformer_itr = cudf::detail::make_counting_transform_iterator( - 0, normalize_spaces_fn{d_strings}); // this does size-only calc - auto offsets_column = cudf::strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto d_offsets = offsets_column->view().data(); - - // build the chars column - cudf::size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; - auto chars_column = cudf::strings::detail::create_chars_child_column( - strings_count, strings.null_count(), bytes, stream, mr); - auto d_chars = chars_column->mutable_view().data(); + auto d_strings = cudf::column_device_view::create(strings.parent(), stream); - // copy tokens to the chars buffer - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - normalize_spaces_fn{d_strings, d_offsets, d_chars}); - chars_column->set_null_count(0); // reset null count for child column + // build offsets and children using the normalize_space_fn + auto children = cudf::strings::detail::make_strings_children( + normalize_spaces_fn{*d_strings}, strings.size(), strings.null_count(), stream, mr); - return cudf::make_strings_column(strings_count, - std::move(offsets_column), - std::move(chars_column), + return cudf::make_strings_column(strings.size(), + std::move(children.first), + std::move(children.second), strings.null_count(), - std::move(null_mask), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), stream, mr); } @@ -207,8 +199,7 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const strings_count = strings.size(); - if (strings_count == 0) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); // create the normalizer and call it data_normalizer normalizer(stream, do_lower_case); @@ -229,33 +220,20 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con // - the cp_offsets identify which code-points go with which strings uint32_t const* cp_chars = result.first->data(); int32_t const* cp_offsets = reinterpret_cast(result.second->data()); - auto strings_column = cudf::column_device_view::create(strings.parent(), stream); - - // build the output offsets column: compute the output size of each string - auto offsets_transformer_itr = cudf::detail::make_counting_transform_iterator( - 0, codepoint_to_utf8_fn{*strings_column, cp_chars, cp_offsets}); - auto offsets_column = cudf::strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto d_offsets = offsets_column->view().data(); - // create the output chars column - cudf::size_type output_bytes = - cudf::detail::get_value(offsets_column->view(), strings_count, stream); - auto chars_column = cudf::strings::detail::create_chars_child_column( - strings_count, strings.null_count(), output_bytes, stream, mr); - auto d_chars = chars_column->mutable_view().data(); + auto d_strings = cudf::column_device_view::create(strings.parent(), stream); - // build the chars output data: convert the 4-byte code-point values into UTF-8 chars - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - codepoint_to_utf8_fn{*strings_column, cp_chars, cp_offsets, d_offsets, d_chars}); - chars_column->set_null_count(0); // reset null count for child column + // build offsets and children using the codepoint_to_utf8_fn + auto children = cudf::strings::detail::make_strings_children( + codepoint_to_utf8_fn{*d_strings, cp_chars, cp_offsets}, + strings.size(), + strings.null_count(), + stream, + mr); - return cudf::make_strings_column(strings_count, - std::move(offsets_column), - std::move(chars_column), + return cudf::make_strings_column(strings.size(), + std::move(children.first), + std::move(children.second), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr), stream,