Add gbenchmark for nvtext normalize functions (#7668)

Reference #5696. Creates gbenchmarks for the `nvtext::normalize_spaces()` and `nvtext::normalize_characters()` functions. The benchmarks measure various string lengths and numbers of rows. I found that `normalize_spaces()` is used in haproxy parsing along with `extract`, so having this benchmark helps measure possible performance improvement solutions there. The `normalize_characters()` function is the same code used as part of the `subword_tokenizer`. Since each requires a different memory footprint, my initial goal for them to share a common benchmark structure did not work out, so the 2 benchmarks are in separate gbenchmark source files. I refactored some of this code to use the more efficient `make_strings_children`, and this improved the performance of `normalize_spaces` by 2-3x. The current subword-tokenizer gbenchmark is also incorporated into the TEXT_BENCH gbenchmark. Authors: - David (@davidwendt) Approvers: - Vukasin Milovanovic (@vuule) - Conor Hoekstra (@codereport) - Mark Harris (@harrism) URL: #7668
rapidsai · Mar 23, 2021 · e0056ed · e0056ed
1 parent 0f8035d
commit e0056ed
Show file tree

Hide file tree

Showing 4 changed files with 213 additions and 82 deletions.
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
@@ -173,8 +173,11 @@ ConfigureBench(AST_BENCH ast/transform_benchmark.cpp)
 ConfigureBench(BINARYOP_BENCH binaryop/binaryop_benchmark.cu)
 
 ###################################################################################################
-# - subword tokenizer benchmark -------------------------------------------------------------------
-ConfigureBench(SUBWORD_TOKENIZER_BENCH text/subword_benchmark.cpp)
+# - nvtext benchmark -------------------------------------------------------------------
+ConfigureBench(TEXT_BENCH
+  text/normalize_benchmark.cpp
+  text/normalize_spaces_benchmark.cpp
+  text/subword_benchmark.cpp)
 
 ###################################################################################################
 # - strings benchmark -------------------------------------------------------------------

diff --git a/cpp/benchmarks/text/normalize_benchmark.cpp b/cpp/benchmarks/text/normalize_benchmark.cpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmark/benchmark.h>
+#include <benchmarks/common/generate_benchmark_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+
+#include <nvtext/normalize.hpp>
+
+#include <cstddef>
+#include <limits>
+
+// Benchmark fixture for the nvtext normalize benchmarks; it adds no members to cudf::benchmark.
+// NOTE(review): normalize_spaces_benchmark.cpp defines a class with this same name and is built
+// into the same TEXT_BENCH target, so the two definitions must stay identical.
+class TextNormalize : public cudf::benchmark {
+};
+
+/**
+ * @brief Times `nvtext::normalize_characters()` on a randomly generated strings column.
+ *
+ * The input table is built once, outside the timed loop. Each benchmark iteration
+ * is wrapped in a `cuda_event_timer` and the processed-bytes counter is set to
+ * `iterations * chars_size` of the input column.
+ *
+ * @param state Benchmark state; `range(0)` is the number of rows and `range(1)` is the
+ *              upper distribution parameter used for the generated strings
+ * @param to_lower Forwarded unchanged to `nvtext::normalize_characters()`
+ */
+static void BM_normalize(benchmark::State& state, bool to_lower)
+{
+  auto const num_rows   = static_cast<cudf::size_type>(state.range(0));
+  auto const max_length = static_cast<cudf::size_type>(state.range(1));
+
+  // single STRING column drawn from a NORMAL distribution over [0, max_length]
+  data_profile profile;
+  profile.set_distribution_params(cudf::type_id::STRING, distribution_id::NORMAL, 0, max_length);
+  auto const random_table =
+    create_random_table({cudf::type_id::STRING}, 1, row_count{num_rows}, profile);
+  cudf::strings_column_view strings_view(random_table->view().column(0));
+
+  for (auto _ : state) {
+    // timer object lives for exactly one iteration
+    cuda_event_timer timer(state, true, 0);
+    nvtext::normalize_characters(strings_view, to_lower);
+  }
+
+  auto const bytes_per_iteration = strings_view.chars_size();
+  state.SetBytesProcessed(state.iterations() * bytes_per_iteration);
+}
+
+/**
+ * @brief Registers the (row count, row length) argument pairs for the benchmarks in this file.
+ *
+ * Row counts run from 2^12 to 2^24 in multiples of 8 and row lengths from 2^5 to 2^13
+ * in multiples of 4. A pair is registered only when `rows * length * 4` is strictly
+ * below the maximum `cudf::size_type` value.
+ *
+ * @param b Benchmark object that receives each accepted pair through `Args()`
+ */
+static void generate_bench_args(benchmark::internal::Benchmark* b)
+{
+  int const min_rows   = 1 << 12;
+  int const max_rows   = 1 << 24;
+  int const row_mult   = 8;
+  int const min_rowlen = 1 << 5;
+  int const max_rowlen = 1 << 13;
+  int const len_mult   = 4;
+
+  // column limit widened once so the comparison below is done entirely in std::size_t
+  auto const max_chars = static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max());
+
+  // named `num_rows` so the loop variable does not hide the `row_count` type
+  // that this file uses when building the input table
+  for (int num_rows = min_rows; num_rows <= max_rows; num_rows *= row_mult) {
+    for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) {
+      // avoid generating combinations that exceed the cudf column limit
+      // NOTE(review): the factor of 4 presumably budgets for output growth — confirm
+      auto const total_chars = static_cast<std::size_t>(num_rows) * rowlen * 4;
+      if (total_chars < max_chars) { b->Args({num_rows, rowlen}); }
+    }
+  }
+}
+
+// Defines one benchmark on the TextNormalize fixture and registers it with the
+// argument pairs from generate_bench_args, manual timing, and millisecond units.
+//   bench_name - name of the benchmark within the fixture
+//   lower_flag - value passed to BM_normalize as its `to_lower` argument
+#define NVTEXT_BENCHMARK_DEFINE(bench_name, lower_flag)                         \
+  BENCHMARK_DEFINE_F(TextNormalize, bench_name)                                 \
+  (::benchmark::State & bench_state) { BM_normalize(bench_state, lower_flag); } \
+  BENCHMARK_REGISTER_F(TextNormalize, bench_name)                               \
+    ->Apply(generate_bench_args)                                                \
+    ->UseManualTime()                                                           \
+    ->Unit(benchmark::kMillisecond);
+
+// one benchmark per value of the `to_lower` flag
+NVTEXT_BENCHMARK_DEFINE(characters, false)
+NVTEXT_BENCHMARK_DEFINE(to_lower, true)
diff --git a/cpp/benchmarks/text/normalize_spaces_benchmark.cpp b/cpp/benchmarks/text/normalize_spaces_benchmark.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmark/benchmark.h>
+#include <benchmarks/common/generate_benchmark_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/string/string_bench_args.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+
+#include <nvtext/normalize.hpp>
+
+// Benchmark fixture for the normalize_spaces benchmark; it adds no members to cudf::benchmark.
+// NOTE(review): normalize_benchmark.cpp defines a class with this same name and is built
+// into the same TEXT_BENCH target, so the two definitions must stay identical.
+class TextNormalize : public cudf::benchmark {
+};
+
+/**
+ * @brief Times `nvtext::normalize_spaces()` on a randomly generated strings column.
+ *
+ * The input table is built once, outside the timed loop. Each benchmark iteration
+ * is wrapped in a `cuda_event_timer` and the processed-bytes counter is set to
+ * `iterations * chars_size` of the input column.
+ *
+ * @param state Benchmark state; `range(0)` is the number of rows and `range(1)` is the
+ *              upper distribution parameter used for the generated strings
+ */
+static void BM_normalize(benchmark::State& state)
+{
+  auto const num_rows   = static_cast<cudf::size_type>(state.range(0));
+  auto const max_length = static_cast<cudf::size_type>(state.range(1));
+
+  // single STRING column drawn from a NORMAL distribution over [0, max_length]
+  data_profile profile;
+  profile.set_distribution_params(cudf::type_id::STRING, distribution_id::NORMAL, 0, max_length);
+  auto const random_table =
+    create_random_table({cudf::type_id::STRING}, 1, row_count{num_rows}, profile);
+  cudf::strings_column_view strings_view(random_table->view().column(0));
+
+  for (auto _ : state) {
+    // timer object lives for exactly one iteration
+    cuda_event_timer timer(state, true, 0);
+    nvtext::normalize_spaces(strings_view);
+  }
+
+  auto const bytes_per_iteration = strings_view.chars_size();
+  state.SetBytesProcessed(state.iterations() * bytes_per_iteration);
+}
+
+/**
+ * @brief Registers the benchmark arguments by delegating to `generate_string_bench_args`.
+ *
+ * Passes row counts from 2^12 to 2^24 with a multiplier of 8 and row lengths
+ * from 2^5 to 2^13 with a multiplier of 4.
+ *
+ * @param b Benchmark object forwarded to `generate_string_bench_args`
+ */
+static void generate_bench_args(benchmark::internal::Benchmark* b)
+{
+  // row-count range and its multiplier
+  int const rows_lo   = 1 << 12;
+  int const rows_hi   = 1 << 24;
+  int const rows_step = 8;
+  // row-length range and its multiplier
+  int const len_lo   = 1 << 5;
+  int const len_hi   = 1 << 13;
+  int const len_step = 4;
+
+  generate_string_bench_args(b, rows_lo, rows_hi, rows_step, len_lo, len_hi, len_step);
+}
+
+// Defines one benchmark on the TextNormalize fixture and registers it with the
+// argument pairs from generate_bench_args, manual timing, and millisecond units.
+//   bench_name - name of the benchmark within the fixture
+#define NVTEXT_BENCHMARK_DEFINE(bench_name)                         \
+  BENCHMARK_DEFINE_F(TextNormalize, bench_name)                     \
+  (::benchmark::State & bench_state) { BM_normalize(bench_state); } \
+  BENCHMARK_REGISTER_F(TextNormalize, bench_name)                   \
+    ->Apply(generate_bench_args)                                    \
+    ->UseManualTime()                                               \
+    ->Unit(benchmark::kMillisecond);
+
+NVTEXT_BENCHMARK_DEFINE(spaces)