diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 0bf92ff54bb..7b50989779d 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -173,8 +173,11 @@ ConfigureBench(AST_BENCH ast/transform_benchmark.cpp)
 ConfigureBench(BINARYOP_BENCH binaryop/binaryop_benchmark.cu)
 
 ###################################################################################################
-# - subword tokenizer benchmark -------------------------------------------------------------------
-ConfigureBench(SUBWORD_TOKENIZER_BENCH text/subword_benchmark.cpp)
+# - nvtext benchmarks (normalize, normalize_spaces, subword tokenizer) ----------------------------
+ConfigureBench(TEXT_BENCH
+  text/normalize_benchmark.cpp
+  text/normalize_spaces_benchmark.cpp
+  text/subword_benchmark.cpp)
 
 ###################################################################################################
 # - strings benchmark -------------------------------------------------------------------
diff --git a/cpp/benchmarks/text/normalize_benchmark.cpp b/cpp/benchmarks/text/normalize_benchmark.cpp
new file mode 100644
index 00000000000..32c4fb7dcde
--- /dev/null
+++ b/cpp/benchmarks/text/normalize_benchmark.cpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmark/benchmark.h>
+#include <benchmarks/common/generate_benchmark_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+
+#include <nvtext/normalize.hpp>
+
+class TextNormalize : public cudf::benchmark {  // fixture named in BENCHMARK_DEFINE_F below
+};
+
+static void BM_normalize(benchmark::State& state, bool to_lower)
+{
+  // benchmark args: range(0) -> number of rows, range(1) -> maximum string length
+  auto const num_rows   = static_cast<cudf::size_type>(state.range(0));
+  auto const max_length = static_cast<cudf::size_type>(state.range(1));
+  data_profile profile;
+  profile.set_distribution_params(cudf::type_id::STRING, distribution_id::NORMAL, 0, max_length);
+  auto const strings_table =
+    create_random_table({cudf::type_id::STRING}, 1, row_count{num_rows}, profile);
+  cudf::strings_column_view strings_view(strings_table->view().column(0));
+  for (auto _ : state) {
+    // scoped timer; the benchmark is registered with UseManualTime()
+    cuda_event_timer timer(state, true, 0);
+    nvtext::normalize_characters(strings_view, to_lower);
+  }
+  auto const bytes_per_iteration = strings_view.chars_size();
+  state.SetBytesProcessed(state.iterations() * bytes_per_iteration);
+}
+
+static void generate_bench_args(benchmark::internal::Benchmark* b)
+{
+  constexpr int rows_begin = 1 << 12;
+  constexpr int rows_end   = 1 << 24;
+  constexpr int rows_step  = 8;
+  constexpr int len_begin  = 1 << 5;
+  constexpr int len_end    = 1 << 13;
+  constexpr int len_step   = 4;
+  auto const column_limit  = std::numeric_limits<cudf::size_type>::max();
+  for (int rows = rows_begin; rows <= rows_end; rows *= rows_step) {
+    for (int len = len_begin; len <= len_end; len *= len_step) {
+      // skip combinations whose size estimate (rows * len * 4) reaches the cudf column limit
+      auto const estimate = static_cast<size_t>(rows) * len * 4;
+      if (estimate >= column_limit) { continue; }
+      b->Args({rows, len});
+    }
+  }
+}
+
+#define NVTEXT_BENCHMARK_DEFINE(bench_name, do_lower)             \
+  BENCHMARK_DEFINE_F(TextNormalize, bench_name)                   \
+  (::benchmark::State & state) { BM_normalize(state, do_lower); } \
+  BENCHMARK_REGISTER_F(TextNormalize, bench_name)                 \
+    ->Apply(generate_bench_args)                                  \
+    ->UseManualTime()                                             \
+    ->Unit(benchmark::kMillisecond);
+// one benchmark per lower-casing mode passed to nvtext::normalize_characters
+NVTEXT_BENCHMARK_DEFINE(characters, false)
+NVTEXT_BENCHMARK_DEFINE(to_lower, true)
diff --git a/cpp/benchmarks/text/normalize_spaces_benchmark.cpp b/cpp/benchmarks/text/normalize_spaces_benchmark.cpp
new file mode 100644
index 00000000000..dcabb0c225c
--- /dev/null
+++ b/cpp/benchmarks/text/normalize_spaces_benchmark.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmark/benchmark.h>
+#include <benchmarks/common/generate_benchmark_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/string/string_bench_args.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+
+#include <nvtext/normalize.hpp>
+
+class TextNormalize : public cudf::benchmark {  // fixture named in BENCHMARK_DEFINE_F below
+};
+
+static void BM_normalize(benchmark::State& state)
+{
+  // benchmark args: range(0) -> number of rows, range(1) -> maximum string length
+  auto const num_rows   = static_cast<cudf::size_type>(state.range(0));
+  auto const max_length = static_cast<cudf::size_type>(state.range(1));
+  data_profile profile;
+  profile.set_distribution_params(cudf::type_id::STRING, distribution_id::NORMAL, 0, max_length);
+  auto const strings_table =
+    create_random_table({cudf::type_id::STRING}, 1, row_count{num_rows}, profile);
+  cudf::strings_column_view strings_view(strings_table->view().column(0));
+  for (auto _ : state) {
+    // scoped timer; the benchmark is registered with UseManualTime()
+    cuda_event_timer timer(state, true, 0);
+    nvtext::normalize_spaces(strings_view);
+  }
+  auto const bytes_per_iteration = strings_view.chars_size();
+  state.SetBytesProcessed(state.iterations() * bytes_per_iteration);
+}
+
+static void generate_bench_args(benchmark::internal::Benchmark* b)
+{
+  constexpr int rows_begin = 1 << 12;  // smallest row count
+  constexpr int rows_end   = 1 << 24;  // largest row count
+  constexpr int rows_step  = 8;        // multiplier between row counts
+  constexpr int len_begin  = 1 << 5;   // smallest row length
+  constexpr int len_end    = 1 << 13;  // largest row length
+  constexpr int len_step   = 4;        // multiplier between row lengths
+  generate_string_bench_args(b, rows_begin, rows_end, rows_step, len_begin, len_end, len_step);
+}
+
+#define NVTEXT_BENCHMARK_DEFINE(bench_name)             \
+  BENCHMARK_DEFINE_F(TextNormalize, bench_name)         \
+  (::benchmark::State & state) { BM_normalize(state); } \
+  BENCHMARK_REGISTER_F(TextNormalize, bench_name)       \
+    ->Apply(generate_bench_args)                        \
+    ->UseManualTime()                                   \
+    ->Unit(benchmark::kMillisecond);
+// the single nvtext::normalize_spaces benchmark instance
+NVTEXT_BENCHMARK_DEFINE(spaces)
diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu
index 6ebe529b56e..e3a43ac25c0 100644
--- a/cpp/src/text/normalize.cu
+++ b/cpp/src/text/normalize.cu
@@ -14,6 +14,12 @@
  * limitations under the License.
  */
 
+#include <nvtext/normalize.hpp>
+#include <text/subword/detail/data_normalizer.hpp>
+#include <text/utilities/tokenize_ops.cuh>
+
+#include <strings/utilities.cuh>
+
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
@@ -22,17 +28,11 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 
-#include <strings/utilities.cuh>
-
-#include <nvtext/normalize.hpp>
-
-#include <text/subword/detail/data_normalizer.hpp>
-#include <text/utilities/tokenize_ops.cuh>
-
 #include <rmm/cuda_stream_view.hpp>
 
 #include <thrust/for_each.h>
@@ -54,32 +54,39 @@ namespace {
  */
 struct normalize_spaces_fn {
   cudf::column_device_view const d_strings;  // strings to normalize
-  int32_t const* d_offsets{};                // offsets into d_buffer
-  char* d_buffer{};                          // output buffer for characters
+  int32_t* d_offsets{};                      // sizes if !d_chars, else offsets into d_chars
+  char* d_chars{};                           // output buffer for characters; null => sizes only
 
-  __device__ int32_t operator()(cudf::size_type idx)
+  __device__ void operator()(cudf::size_type idx)
   {
-    if (d_strings.is_null(idx)) return 0;
-    cudf::string_view single_space(" ", 1);
+    if (d_strings.is_null(idx)) {
+      if (!d_chars) d_offsets[idx] = 0;  // null rows have zero output bytes
+      return;
+    }
+    cudf::string_view const single_space(" ", 1);
     auto const d_str = d_strings.element<cudf::string_view>(idx);
-    char* buffer     = d_offsets ? d_buffer + d_offsets[idx] : nullptr;
+    char* buffer     = d_chars ? d_chars + d_offsets[idx] : nullptr;
     char* optr       = buffer;  // running output pointer
     int32_t nbytes   = 0;       // holds the number of bytes per output string
-    // create tokenizer for this string with whitespace delimiter (default)
+
+    // create a tokenizer for this string with whitespace delimiter (default)
     characters_tokenizer tokenizer(d_str);
+
     // this will retrieve tokens automatically skipping runs of whitespace
     while (tokenizer.next_token()) {
-      auto token_pos = tokenizer.token_byte_positions();
+      auto const token_pos = tokenizer.token_byte_positions();
       nbytes += token_pos.second - token_pos.first + 1;  // token size plus a single space
       if (optr) {
-        cudf::string_view token(d_str.data() + token_pos.first, token_pos.second - token_pos.first);
+        cudf::string_view const token(d_str.data() + token_pos.first,
+                                      token_pos.second - token_pos.first);
         if (optr != buffer)  // prepend space unless we are at the beginning
           optr = cudf::strings::detail::copy_string(optr, single_space);
         // write token to output buffer
-        optr = cudf::strings::detail::copy_string(optr, token);  // copy token to output
+        optr = cudf::strings::detail::copy_string(optr, token);
       }
     }
-    return (nbytes > 0) ? nbytes - 1 : 0;  // remove trailing space
+    // when only computing sizes, store the byte count without the trailing space
+    if (!d_chars) d_offsets[idx] = (nbytes > 0) ? nbytes - 1 : 0;
   }
 };
 
@@ -95,7 +102,7 @@ struct codepoint_to_utf8_fn {
   cudf::column_device_view const d_strings;  // input strings
   uint32_t const* cp_data;                   // full code-point array
   int32_t const* d_cp_offsets{};             // offsets to each string's code-point array
-  int32_t const* d_offsets{};                // offsets for the output strings
+  int32_t* d_offsets{};                      // offsets for the output strings
   char* d_chars{};                           // buffer for the output strings column
 
   /**
@@ -105,7 +112,7 @@ struct codepoint_to_utf8_fn {
    * @param count number of code-points in `str_cps`
    * @return Number of bytes required for the output
    */
-  __device__ cudf::size_type compute_output_size(uint32_t const* str_cps, uint32_t count)
+  __device__ int32_t compute_output_size(uint32_t const* str_cps, uint32_t count)
   {
     return thrust::transform_reduce(
       thrust::seq,
@@ -113,17 +120,23 @@ struct codepoint_to_utf8_fn {
       str_cps + count,
       [](auto cp) { return 1 + (cp >= UTF8_1BYTE) + (cp >= UTF8_2BYTE) + (cp >= UTF8_3BYTE); },
       0,
-      thrust::plus<cudf::size_type>());
+      thrust::plus<int32_t>());
   }
 
-  __device__ cudf::size_type operator()(cudf::size_type idx)
+  __device__ void operator()(cudf::size_type idx)
   {
-    if (d_strings.is_null(idx)) return 0;
+    if (d_strings.is_null(idx)) {
+      if (!d_chars) d_offsets[idx] = 0;
+      return;
+    }
     auto const d_str  = d_strings.element<cudf::string_view>(idx);
     auto const offset = d_cp_offsets[idx];
     auto const count  = d_cp_offsets[idx + 1] - offset;  // number of code-points
     auto str_cps      = cp_data + offset;                // code-points for this string
-    if (!d_chars) return compute_output_size(str_cps, count);
+    if (!d_chars) {
+      d_offsets[idx] = compute_output_size(str_cps, count);
+      return;
+    }
     // convert each code-point to 1-4 UTF-8 encoded bytes
     char* out_ptr = d_chars + d_offsets[idx];
     for (uint32_t jdx = 0; jdx < count; ++jdx) {
@@ -149,7 +162,6 @@ struct codepoint_to_utf8_fn {
         *out_ptr++ = static_cast<char>((code_point & 0x3F) | 0x0080);
       }
     }
-    return 0;
   }
 };
 
@@ -161,40 +173,20 @@ std::unique_ptr<cudf::column> normalize_spaces(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
-  cudf::size_type strings_count = strings.size();
-  if (strings_count == 0) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
+  if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
 
   // create device column
-  auto strings_column = cudf::column_device_view::create(strings.parent(), stream);
-  auto d_strings      = *strings_column;
-  // copy bitmask
-  rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr);
-
-  // create offsets by calculating size of each string for output
-  auto offsets_transformer_itr = cudf::detail::make_counting_transform_iterator(
-    0, normalize_spaces_fn{d_strings});  // this does size-only calc
-  auto offsets_column = cudf::strings::detail::make_offsets_child_column(
-    offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
-  auto d_offsets = offsets_column->view().data<int32_t>();
-
-  // build the chars column
-  cudf::size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count];
-  auto chars_column     = cudf::strings::detail::create_chars_child_column(
-    strings_count, strings.null_count(), bytes, stream, mr);
-  auto d_chars = chars_column->mutable_view().data<char>();
+  auto d_strings = cudf::column_device_view::create(strings.parent(), stream);
 
-  // copy tokens to the chars buffer
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::make_counting_iterator<cudf::size_type>(0),
-                     strings_count,
-                     normalize_spaces_fn{d_strings, d_offsets, d_chars});
-  chars_column->set_null_count(0);  // reset null count for child column
+  // build the offsets and chars children using the normalize_spaces_fn
+  auto children = cudf::strings::detail::make_strings_children(
+    normalize_spaces_fn{*d_strings}, strings.size(), strings.null_count(), stream, mr);
 
-  return cudf::make_strings_column(strings_count,
-                                   std::move(offsets_column),
-                                   std::move(chars_column),
+  return cudf::make_strings_column(strings.size(),
+                                   std::move(children.first),
+                                   std::move(children.second),
                                    strings.null_count(),
-                                   std::move(null_mask),
+                                   cudf::detail::copy_bitmask(strings.parent(), stream, mr),
                                    stream,
                                    mr);
 }
@@ -207,8 +199,7 @@ std::unique_ptr<cudf::column> normalize_characters(cudf::strings_column_view con
                                                    rmm::cuda_stream_view stream,
                                                    rmm::mr::device_memory_resource* mr)
 {
-  auto const strings_count = strings.size();
-  if (strings_count == 0) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
+  if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
 
   // create the normalizer and call it
   data_normalizer normalizer(stream, do_lower_case);
@@ -229,33 +220,20 @@ std::unique_ptr<cudf::column> normalize_characters(cudf::strings_column_view con
   // - the cp_offsets identify which code-points go with which strings
   uint32_t const* cp_chars  = result.first->data();
   int32_t const* cp_offsets = reinterpret_cast<int32_t const*>(result.second->data());
-  auto strings_column       = cudf::column_device_view::create(strings.parent(), stream);
-
-  // build the output offsets column: compute the output size of each string
-  auto offsets_transformer_itr = cudf::detail::make_counting_transform_iterator(
-    0, codepoint_to_utf8_fn{*strings_column, cp_chars, cp_offsets});
-  auto offsets_column = cudf::strings::detail::make_offsets_child_column(
-    offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
-  auto d_offsets = offsets_column->view().data<int32_t>();
 
-  // create the output chars column
-  cudf::size_type output_bytes =
-    cudf::detail::get_value<int32_t>(offsets_column->view(), strings_count, stream);
-  auto chars_column = cudf::strings::detail::create_chars_child_column(
-    strings_count, strings.null_count(), output_bytes, stream, mr);
-  auto d_chars = chars_column->mutable_view().data<char>();
+  auto d_strings = cudf::column_device_view::create(strings.parent(), stream);
 
-  // build the chars output data: convert the 4-byte code-point values into UTF-8 chars
-  thrust::for_each_n(
-    rmm::exec_policy(stream),
-    thrust::make_counting_iterator<cudf::size_type>(0),
-    strings_count,
-    codepoint_to_utf8_fn{*strings_column, cp_chars, cp_offsets, d_offsets, d_chars});
-  chars_column->set_null_count(0);  // reset null count for child column
+  // build the offsets and chars children using the codepoint_to_utf8_fn
+  auto children = cudf::strings::detail::make_strings_children(
+    codepoint_to_utf8_fn{*d_strings, cp_chars, cp_offsets},
+    strings.size(),
+    strings.null_count(),
+    stream,
+    mr);
 
-  return cudf::make_strings_column(strings_count,
-                                   std::move(offsets_column),
-                                   std::move(chars_column),
+  return cudf::make_strings_column(strings.size(),
+                                   std::move(children.first),
+                                   std::move(children.second),
                                    strings.null_count(),
                                    cudf::detail::copy_bitmask(strings.parent(), stream, mr),
                                    stream,