Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add gbenchmark for cudf::strings::translate function #7617

Merged
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -189,4 +189,5 @@ ConfigureBench(STRINGS_BENCH
string/find_benchmark.cpp
string/replace_benchmark.cpp
string/split_benchmark.cpp
string/translate_benchmark.cpp
string/url_decode_benchmark.cpp)
2 changes: 2 additions & 0 deletions cpp/benchmarks/string/string_bench_args.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

#include <benchmark/benchmark.h>

#include <cudf/types.hpp>

/**
* @brief Generate row count and row length argument ranges for a string benchmark.
*
Expand Down
86 changes: 86 additions & 0 deletions cpp/benchmarks/string/translate_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "string_bench_args.hpp"

#include <benchmark/benchmark.h>
#include <benchmarks/common/generate_benchmark_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/strings/strings_column_view.hpp>
#include <cudf/strings/translate.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <algorithm>
#include <random>
davidwendt marked this conversation as resolved.
Show resolved Hide resolved

#include <thrust/iterator/counting_iterator.h>

/**
 * @brief Benchmark fixture for the strings translate benchmarks.
 *
 * Adds no members of its own; it only gives the translate benchmarks a
 * distinct fixture type derived from cudf::benchmark.
 */
class StringTranslate : public cudf::benchmark {};

// One translate-table entry: (character to look for, character to substitute).
using entry_type = std::pair<cudf::char_utf8, cudf::char_utf8>;

/**
 * @brief Benchmark body for cudf::strings::translate.
 *
 * Builds a single-column random strings table, builds a translate table with
 * `entry_count` entries, and calls cudf::strings::translate once per benchmark
 * iteration.
 *
 * @param state       Benchmark state; `range(0)` is used as the number of rows
 *                    and `range(1)` as the maximum string length.
 * @param entry_count Number of entries to place in the translate table.
 */
static void BM_translate(benchmark::State& state, int entry_count)
{
  cudf::size_type const n_rows{static_cast<cudf::size_type>(state.range(0))};
  cudf::size_type const max_str_length{static_cast<cudf::size_type>(state.range(1))};

  // NOTE(review): the (0, max_str_length) pair is presumably the lower/upper
  // bound of the generated string lengths -- confirm against data_profile.
  data_profile table_profile;
  table_profile.set_distribution_params(
    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
  auto const table =
    create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile);
  cudf::strings_column_view input(table->view().column(0));

  // Entry i maps the character ('!' + i) to the character ('~' - i).
  std::vector<entry_type> entries(entry_count);
  std::transform(thrust::counting_iterator<int>(0),
                 thrust::counting_iterator<int>(entry_count),
                 entries.begin(),
                 [](auto idx) -> entry_type {
                   return entry_type{'!' + idx, '~' - idx};
                 });

  for (auto _ : state) {
    // Scoped timer created fresh for every iteration.
    // NOTE(review): presumably reports the elapsed GPU time to `state` when it
    // goes out of scope -- confirm against cuda_event_timer.
    cuda_event_timer raii(state, true, 0);
    cudf::strings::translate(input, entries);
  }

  state.SetBytesProcessed(state.iterations() * input.chars_size());
}

/**
 * @brief Registers the row-count / row-length argument ranges for the
 * translate benchmarks.
 *
 * Forwards fixed bounds to generate_string_bench_args: row counts from 4096 to
 * 16777216 with multiplier 8, and row lengths from 32 to 8192 with
 * multiplier 4.
 *
 * @param b Benchmark instance to receive the argument ranges.
 */
static void generate_bench_args(benchmark::internal::Benchmark* b)
{
  constexpr int row_count_lo   = 1 << 12;
  constexpr int row_count_hi   = 1 << 24;
  constexpr int row_count_step = 8;

  constexpr int row_length_lo   = 1 << 5;
  constexpr int row_length_hi   = 1 << 13;
  constexpr int row_length_step = 4;

  generate_string_bench_args(b,
                             row_count_lo,
                             row_count_hi,
                             row_count_step,
                             row_length_lo,
                             row_length_hi,
                             row_length_step);
}

// Defines a StringTranslate fixture benchmark called `name` whose body runs
// BM_translate with a translate table of `entries` entries, then registers it
// with the argument ranges from generate_bench_args, manual timing, and
// millisecond reporting units.
#define STRINGS_BENCHMARK_DEFINE(name, entries) \
BENCHMARK_DEFINE_F(StringTranslate, name) \
(::benchmark::State & st) { BM_translate(st, entries); } \
BENCHMARK_REGISTER_F(StringTranslate, name) \
->Apply(generate_bench_args) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

// Three benchmark variants that differ only in translate-table size: 5, 25 and 50 entries.
STRINGS_BENCHMARK_DEFINE(translate_small, 5)
STRINGS_BENCHMARK_DEFINE(translate_medium, 25)
STRINGS_BENCHMARK_DEFINE(translate_large, 50)
94 changes: 48 additions & 46 deletions cpp/src/strings/translate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/strings/detail/utilities.hpp>
Expand All @@ -30,7 +29,8 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/find.h>
#include <thrust/binary_search.h>
#include <thrust/sort.h>

#include <algorithm>

Expand All @@ -46,31 +46,37 @@ namespace {
*/
struct translate_fn {
column_device_view const d_strings;
rmm::device_vector<translate_table>::iterator table_begin;
rmm::device_vector<translate_table>::iterator table_end;
int32_t const* d_offsets{};
rmm::device_uvector<translate_table>::iterator table_begin;
rmm::device_uvector<translate_table>::iterator table_end;
int32_t* d_offsets{};
char* d_chars{};

__device__ size_type operator()(size_type idx)
__device__ void operator()(size_type idx)
{
if (d_strings.is_null(idx)) return 0;
string_view d_str = d_strings.element<string_view>(idx);
size_type bytes = d_str.size_bytes();
char* out_ptr = d_offsets ? d_chars + d_offsets[idx] : nullptr;
if (d_strings.is_null(idx)) {
if (!d_chars) d_offsets[idx] = 0;
return;
}
string_view const d_str = d_strings.element<string_view>(idx);

size_type bytes = d_str.size_bytes();
char* out_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr;
for (auto chr : d_str) {
auto entry =
thrust::find_if(thrust::seq, table_begin, table_end, [chr] __device__(auto const& te) {
return te.first == chr;
});
if (entry != table_end) {
auto const entry =
thrust::lower_bound(thrust::seq,
table_begin,
table_end,
translate_table{chr, 0},
[](auto const& lhs, auto const& rhs) { return lhs.first < rhs.first; });
if (entry != table_end && entry->first == chr) {
bytes -= bytes_in_char_utf8(chr);
chr = static_cast<translate_table>(*entry).second;
chr = entry->second;
if (chr) // if null, skip the character
bytes += bytes_in_char_utf8(chr);
}
if (chr && out_ptr) out_ptr += from_char_utf8(chr, out_ptr);
}
return bytes;
if (!d_chars) d_offsets[idx] = bytes;
}
};

Expand All @@ -83,44 +89,40 @@ std::unique_ptr<column> translate(
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
size_type strings_count = strings.size();
if (strings_count == 0) return make_empty_strings_column(stream, mr);
if (strings.is_empty()) return make_empty_strings_column(stream, mr);

size_type table_size = static_cast<size_type>(chars_table.size());
// convert input table
thrust::host_vector<translate_table> htable(table_size);
std::transform(chars_table.begin(), chars_table.end(), htable.begin(), [](auto entry) {
return translate_table{entry.first, entry.second};
});
// The size of this table is usually much less than 100 so it was
// found to be more efficient to sort on the CPU than the GPU.
thrust::sort(htable.begin(), htable.end(), [](auto const& lhs, auto const& rhs) {
return lhs.first < rhs.first;
});
// copy translate table to device memory
rmm::device_vector<translate_table> table(htable);

auto strings_column = column_device_view::create(strings.parent(), stream);
auto d_strings = *strings_column;
// create null mask
rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr);
// create offsets column
auto offsets_transformer_itr = cudf::detail::make_counting_transform_iterator(
0, translate_fn{d_strings, table.begin(), table.end()});
auto offsets_column = make_offsets_child_column(
offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
auto d_offsets = offsets_column->view().data<int32_t>();

// build chars column
size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count];
auto chars_column = strings::detail::create_chars_child_column(
strings_count, strings.null_count(), bytes, stream, mr);
auto d_chars = chars_column->mutable_view().data<char>();
thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
strings_count,
translate_fn{d_strings, table.begin(), table.end(), d_offsets, d_chars});

return make_strings_column(strings_count,
std::move(offsets_column),
std::move(chars_column),
rmm::device_uvector<translate_table> table(htable.size(), stream);
CUDA_TRY(cudaMemcpyAsync(table.data(),
htable.data(),
sizeof(translate_table) * htable.size(),
cudaMemcpyHostToDevice,
stream.value()));

auto d_strings = column_device_view::create(strings.parent(), stream);

auto children = make_strings_children(translate_fn{*d_strings, table.begin(), table.end()},
strings.size(),
strings.null_count(),
stream,
mr);

return make_strings_column(strings.size(),
std::move(children.first),
std::move(children.second),
strings.null_count(),
std::move(null_mask),
cudf::detail::copy_bitmask(strings.parent(), stream, mr),
stream,
mr);
}
Expand Down