Skip to content

Commit

Permalink
Add gbenchmark for nvtext normalize functions (#7668)
Browse files Browse the repository at this point in the history
Reference #5696
Creates a gbenchmark for `nvtext::normalize_spaces()` and  `nvtext::normalize_characters()` functions.
The benchmarks measure various string lengths and numbers of rows.
I found that `normalize_spaces()` is used in haproxy parsing along with `extract` so having this benchmark helps measure possible performance improvement solutions there.
The `normalize_characters` is the same code used as part of the `subword_tokenizer`. 

Since each requires a different memory footprint, my initial goal of having them share a common benchmark structure did not work out. So the two tests are in separate gbenchmark source files.

I refactored some of this code to use the more efficient `make_strings_children` and this improved the performance of `normalize_spaces` by 2-3x.

The current subword-tokenizer gbenchmark is also incorporated into the `TEXT_BENCH` gbenchmark.

Authors:
  - David (@davidwendt)

Approvers:
  - Vukasin Milovanovic (@vuule)
  - Conor Hoekstra (@codereport)
  - Mark Harris (@harrism)

URL: #7668
  • Loading branch information
davidwendt authored Mar 23, 2021
1 parent 0f8035d commit e0056ed
Show file tree
Hide file tree
Showing 4 changed files with 213 additions and 82 deletions.
7 changes: 5 additions & 2 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -173,8 +173,11 @@ ConfigureBench(AST_BENCH ast/transform_benchmark.cpp)
ConfigureBench(BINARYOP_BENCH binaryop/binaryop_benchmark.cu)

###################################################################################################
# - subword tokenizer benchmark -------------------------------------------------------------------
ConfigureBench(SUBWORD_TOKENIZER_BENCH text/subword_benchmark.cpp)
# - nvtext benchmark -------------------------------------------------------------------
# Registers the three nvtext benchmark sources (normalize_characters, normalize_spaces and
# the subword tokenizer) under the single TEXT_BENCH name.
ConfigureBench(TEXT_BENCH
text/normalize_benchmark.cpp
text/normalize_spaces_benchmark.cpp
text/subword_benchmark.cpp)

###################################################################################################
# - strings benchmark -------------------------------------------------------------------
Expand Down
79 changes: 79 additions & 0 deletions cpp/benchmarks/text/normalize_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmark/benchmark.h>
#include <benchmarks/common/generate_benchmark_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <nvtext/normalize.hpp>

#include <cstddef>
#include <limits>

/// Benchmark fixture for the nvtext::normalize_characters() benchmarks.
/// Adds no members of its own; everything comes from cudf::benchmark.
class TextNormalize : public cudf::benchmark {};

/**
 * @brief Benchmarks nvtext::normalize_characters() on a randomly generated strings column.
 *
 * @param state    Benchmark state; `range(0)` is the number of rows and `range(1)` is the
 *                 upper parameter handed to the STRING data profile.
 * @param to_lower Forwarded unchanged as the second argument of
 *                 nvtext::normalize_characters().
 */
static void BM_normalize(benchmark::State& state, bool to_lower)
{
  auto const num_rows   = static_cast<cudf::size_type>(state.range(0));
  auto const max_length = static_cast<cudf::size_type>(state.range(1));

  // STRING profile: NORMAL distribution parameterized with 0 and max_length
  // (presumably the string-length bounds -- confirm in generate_benchmark_input.hpp).
  data_profile profile;
  profile.set_distribution_params(cudf::type_id::STRING, distribution_id::NORMAL, 0, max_length);

  // One STRING column with num_rows rows; the input is built once, outside the timed loop.
  auto const random_table =
    create_random_table({cudf::type_id::STRING}, 1, row_count{num_rows}, profile);
  cudf::strings_column_view input(random_table->view().column(0));

  for (auto _ : state) {
    // NOTE(review): scoped timer; presumably reports the elapsed time to `state` when it
    // goes out of scope (the benchmark is registered with UseManualTime) -- confirm in
    // synchronization.hpp.
    cuda_event_timer timer(state, true, 0);
    nvtext::normalize_characters(input, to_lower);
  }

  // Throughput is reported against the size of the input character data.
  auto const bytes_per_iteration = input.chars_size();
  state.SetBytesProcessed(state.iterations() * bytes_per_iteration);
}

/**
 * @brief Registers the (row count, row length) argument pairs for the normalize benchmarks.
 *
 * Row counts run from 2^12 to 2^24 in multiples of 8 and row lengths run from 2^5 to 2^13
 * in multiples of 4. A pair is registered only when `rows * rowlen * 4` is below the
 * maximum value of cudf::size_type.
 *
 * @param b Benchmark object that receives each accepted pair through `Args()`.
 */
static void generate_bench_args(benchmark::internal::Benchmark* b)
{
  constexpr int min_rows   = 1 << 12;
  constexpr int max_rows   = 1 << 24;
  constexpr int row_mult   = 8;
  constexpr int min_rowlen = 1 << 5;
  constexpr int max_rowlen = 1 << 13;
  constexpr int len_mult   = 4;

  // Converted once so the size check below compares unsigned with unsigned.
  constexpr auto size_limit =
    static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max());

  // The counter is named `num_rows` so it does not hide the `row_count` type used by
  // create_random_table() elsewhere in this file.
  for (int num_rows = min_rows; num_rows <= max_rows; num_rows *= row_mult) {
    for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) {
      // avoid generating combinations that exceed the cudf column limit
      // NOTE(review): the factor of 4 is presumably headroom for the extra memory that
      // normalize_characters() needs beyond the input itself -- confirm.
      auto const total_chars = static_cast<std::size_t>(num_rows) * rowlen * 4;
      if (total_chars < size_limit) { b->Args({num_rows, rowlen}); }
    }
  }
}

// Defines a TextNormalize fixture benchmark called `name` whose body forwards to
// BM_normalize(st, lower), then registers it with the argument pairs produced by
// generate_bench_args, manual timing, and millisecond reporting units.
//   name  - identifier of the benchmark within the TextNormalize fixture
//   lower - value passed as the `to_lower` argument of BM_normalize
#define NVTEXT_BENCHMARK_DEFINE(name, lower) \
BENCHMARK_DEFINE_F(TextNormalize, name) \
(::benchmark::State & st) { BM_normalize(st, lower); } \
BENCHMARK_REGISTER_F(TextNormalize, name) \
->Apply(generate_bench_args) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

// `characters` runs with to_lower = false; `to_lower` runs with to_lower = true.
NVTEXT_BENCHMARK_DEFINE(characters, false)
NVTEXT_BENCHMARK_DEFINE(to_lower, true)
71 changes: 71 additions & 0 deletions cpp/benchmarks/text/normalize_spaces_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmark/benchmark.h>
#include <benchmarks/common/generate_benchmark_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/string/string_bench_args.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <nvtext/normalize.hpp>

/// Benchmark fixture for the nvtext::normalize_spaces() benchmark.
/// Adds no members of its own; everything comes from cudf::benchmark.
class TextNormalize : public cudf::benchmark {};

/**
 * @brief Benchmarks nvtext::normalize_spaces() on a randomly generated strings column.
 *
 * @param state Benchmark state; `range(0)` is the number of rows and `range(1)` is the
 *              upper parameter handed to the STRING data profile.
 */
static void BM_normalize(benchmark::State& state)
{
  auto const num_rows   = static_cast<cudf::size_type>(state.range(0));
  auto const max_length = static_cast<cudf::size_type>(state.range(1));

  // STRING profile: NORMAL distribution parameterized with 0 and max_length
  // (presumably the string-length bounds -- confirm in generate_benchmark_input.hpp).
  data_profile profile;
  profile.set_distribution_params(cudf::type_id::STRING, distribution_id::NORMAL, 0, max_length);

  // One STRING column with num_rows rows; the input is built once, outside the timed loop.
  auto const random_table =
    create_random_table({cudf::type_id::STRING}, 1, row_count{num_rows}, profile);
  cudf::strings_column_view input(random_table->view().column(0));

  for (auto _ : state) {
    // NOTE(review): scoped timer; presumably reports the elapsed time to `state` when it
    // goes out of scope (the benchmark is registered with UseManualTime) -- confirm in
    // synchronization.hpp.
    cuda_event_timer timer(state, true, 0);
    nvtext::normalize_spaces(input);
  }

  // Throughput is reported against the size of the input character data.
  auto const bytes_per_iteration = input.chars_size();
  state.SetBytesProcessed(state.iterations() * bytes_per_iteration);
}

/**
 * @brief Supplies the row-count and row-length ranges for the normalize_spaces benchmark.
 *
 * Passes the bounds and multipliers below to generate_string_bench_args() together with
 * the benchmark object. That helper is declared in string_bench_args.hpp; it presumably
 * registers the resulting argument combinations on `b` -- confirm there.
 *
 * @param b Benchmark object handed on to generate_string_bench_args().
 */
static void generate_bench_args(benchmark::internal::Benchmark* b)
{
  // Row counts: 2^12 up to 2^24, multiplier 8.
  constexpr int rows_lo   = 1 << 12;
  constexpr int rows_hi   = 1 << 24;
  constexpr int rows_step = 8;

  // Row lengths: 2^5 up to 2^13, multiplier 4.
  constexpr int len_lo   = 1 << 5;
  constexpr int len_hi   = 1 << 13;
  constexpr int len_step = 4;

  generate_string_bench_args(b, rows_lo, rows_hi, rows_step, len_lo, len_hi, len_step);
}

// Defines a TextNormalize fixture benchmark called `name` whose body forwards to
// BM_normalize(st), then registers it with the arguments produced by
// generate_bench_args, manual timing, and millisecond reporting units.
//   name - identifier of the benchmark within the TextNormalize fixture
#define NVTEXT_BENCHMARK_DEFINE(name) \
BENCHMARK_DEFINE_F(TextNormalize, name) \
(::benchmark::State & st) { BM_normalize(st); } \
BENCHMARK_REGISTER_F(TextNormalize, name) \
->Apply(generate_bench_args) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

// The single normalize_spaces benchmark.
NVTEXT_BENCHMARK_DEFINE(spaces)
Loading

0 comments on commit e0056ed

Please sign in to comment.