Skip to content

Commit

Permalink
Benchmarking next_code_point; #576
Browse files Browse the repository at this point in the history
  • Loading branch information
the-moisrex committed Dec 21, 2024
1 parent 0695d6a commit 9c4a925
Show file tree
Hide file tree
Showing 7 changed files with 379 additions and 4 deletions.
1 change: 1 addition & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ set(FILE_SOURCES
uri/uri_benchmark.cpp
charset/charset_benchmark.cpp
interleave_bits/interleave_benchmark.cpp
utf_convertion/utf_conversion_benchmark.cpp
)
file(GLOB FILE_PCH *_pch.hpp)

Expand Down
50 changes: 50 additions & 0 deletions benchmarks/common_utils_pch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,56 @@ static StrType str_generator(
return str.substr(0, size);
}

// Generate a random, well-formed UTF-8 string.
//
// @param size  number of code points (not bytes) to generate
// @return      a UTF-8 string holding exactly `size` encoded code points
//
// Code points are drawn from a uniform distribution over 0..0x10FFFF; surrogates
// (U+D800..U+DFFF) and U+FFFE / U+FFFF are rejected and drawn again.
static std::u8string str8_generator(std::size_t size = 10'000) {
    std::u8string str;
    // A code point needs at most 4 UTF-8 code units, so this never has to reallocate.
    str.reserve(size * 4);

    std::mt19937 generator(std::random_device{}());
    // Built once: the distribution does not depend on the loop iteration.
    std::uniform_int_distribution<std::uint32_t> distribution(0, 0x10'FFFF); // Unicode range

    // `generated` only advances when a code point was actually emitted, so rejected
    // draws are simply retried.
    for (std::size_t generated = 0; generated < size;) {
        auto const code_point = static_cast<char32_t>(distribution(generator));

        bool const is_surrogate = code_point >= 0xD800 && code_point <= 0xDFFF;
        bool const is_rejected_nonchar = code_point == 0xFFFE || code_point == 0xFFFF;
        if (is_surrogate || is_rejected_nonchar) {
            continue; // draw again
        }

        // Convert code point to UTF-8
        if (code_point <= 0x7F) { // 1-byte sequence
            str += static_cast<char8_t>(code_point);
        } else if (code_point <= 0x7FF) { // 2-byte sequence
            str += static_cast<char8_t>(0xC0 | (code_point >> 6));
            str += static_cast<char8_t>(0x80 | (code_point & 0x3F));
        } else if (code_point <= 0xFFFF) { // 3-byte sequence
            str += static_cast<char8_t>(0xE0 | (code_point >> 12));
            str += static_cast<char8_t>(0x80 | ((code_point >> 6) & 0x3F));
            str += static_cast<char8_t>(0x80 | (code_point & 0x3F));
        } else { // 4-byte sequence
            str += static_cast<char8_t>(0xF0 | (code_point >> 18));
            str += static_cast<char8_t>(0x80 | ((code_point >> 12) & 0x3F));
            str += static_cast<char8_t>(0x80 | ((code_point >> 6) & 0x3F));
            str += static_cast<char8_t>(0x80 | (code_point & 0x3F));
        }
        ++generated;
    }

    return str;
}

// Build `count` independently generated random UTF-8 strings.
//
// @tparam count  number of strings in the returned array
// @param  size   number of code points per string (forwarded to str8_generator)
// @return        array of byte strings, each holding the UTF-8 code units of one generated string
template <std::size_t count>
static std::array<std::string, count> str8_array_generator(std::size_t size = 10'000) {
    std::array<std::string, count> strs;
    for (auto& str : strs) {
        // std::string cannot be assigned from std::u8string, so copy the code units
        // over; each char8_t is converted to the corresponding char.
        auto const utf8 = str8_generator(size);
        str.assign(utf8.begin(), utf8.end());
    }
    return strs;
}

template <std::size_t count>
static std::array<std::string, count> str_array_generator(
std::size_t size = 10'000,
Expand Down
32 changes: 32 additions & 0 deletions benchmarks/utf_convertion/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Standalone build of the UTF conversion benchmark.
# Every target compiles $(files) with a different compiler / optimization / PGO setup
# and writes the compiler's default output file.

# Compile flags.
flags = -std=c++20 -isystem /usr/local/include
# Link flags. They are passed AFTER the sources: the linker resolves symbols from
# left to right, so libraries listed before the code that uses them may be skipped.
libs = -L/usr/local/lib -lbenchmark_main -lbenchmark -lpthread
optflags = -flto -Ofast -DNDEBUG -march=native -mtune=native
files = utf_conversion_benchmark.cpp

# None of the targets creates a file named after itself, so all of them are phony.
targets = gcc clang gcc-noopt clang-noopt \
          gcc-profile-generate clang-profile-generate \
          gcc-profile-use clang-profile-use

all: gcc
.PHONY: all $(targets)

gcc: $(files)
	g++ $(flags) $(optflags) $(files) $(libs)

clang: $(files)
	clang++ $(flags) $(optflags) $(files) $(libs)

gcc-noopt: $(files)
	g++ $(flags) $(files) $(libs)

clang-noopt: $(files)
	clang++ $(flags) $(files) $(libs)

gcc-profile-generate: $(files)
	g++ $(flags) $(optflags) -fprofile-generate $(files) $(libs)

clang-profile-generate: $(files)
	clang++ $(flags) $(optflags) -fprofile-generate $(files) $(libs)

gcc-profile-use: $(files)
	g++ $(flags) $(optflags) -fprofile-use $(files) $(libs)

clang-profile-use: $(files)
	clang++ $(flags) $(optflags) -fprofile-use $(files) $(libs)


12 changes: 12 additions & 0 deletions benchmarks/utf_convertion/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# UTF conversions benchmark

```
-----------------------------------------------------
Benchmark Time CPU Iterations
-----------------------------------------------------
UTFConv_v1 19903 ns 19867 ns 34903
UTFConv_v2 24368 ns 24171 ns 29093
```

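Version 2 is slower because it does more work per code point: unlike version 1, it validates every sequence (continuation bytes, overlong encodings, surrogates and the U+10FFFF upper bound).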

280 changes: 280 additions & 0 deletions benchmarks/utf_convertion/utf_conversion_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,280 @@
#include "../../webpp/unicode/unicode.hpp"
#include "../benchmark.hpp"
#include "../common_utils_pch.hpp"

// NOLINTBEGIN(*-magic-numbers)
namespace v1 {

using namespace webpp;
using namespace webpp::unicode;
using namespace webpp::unicode::unchecked;

// Decode one code point starting at `pos` and advance `pos` past the code units consumed.
//
// No validation is performed: continuation units are only masked and merged, never checked.
//
// @tparam Iter           iterator over the code units; its value_type selects the decoding branch
// @tparam EIter          type of the end iterator/sentinel
// @tparam CodePointType  type of the returned code point
// @param  pos            current position; updated in place
// @param  end            end of the input
// @return 0 when `pos == end`; the first code unit alone when the input ends right after it;
//         otherwise the decoded value. When a 3- or 4-unit UTF-8 sequence is cut off by `end`,
//         `pos` is rewound to just after the first code unit and that first unit is returned.
template <stl::bidirectional_iterator Iter = char8_t const*,
stl::bidirectional_iterator EIter = Iter,
UTF32 CodePointType = char32_t>
[[nodiscard]] static constexpr CodePointType next_code_point(Iter& pos, EIter end) noexcept {
using code_point_type = CodePointType;
using char_type = typename stl::iterator_traits<Iter>::value_type;
using unsigned_char_type = stl::make_unsigned_t<char_type>;

// nothing to read
if (pos == end) {
return static_cast<code_point_type>(0);
}

// double casting to make sure negative values can't come out of it
auto val = static_cast<code_point_type>(static_cast<unsigned_char_type>(*pos++));
// the first unit was also the last one: return it as-is, whatever it is
if (pos == end) {
return val;
}
if constexpr (UTF16<char_type>) {
// 0xD800..0xDBFF: high (leading) surrogate
if ((val & 0xFC00U) == 0xD800U) {
// we have two chars
// the next unit is assumed to be the low surrogate; it is not checked
val &= 0x3FFU;
val <<= 10U;
val |= static_cast<code_point_type>(*pos++) & 0x3FFU;
val += 0x1'0000U;
return val;
}
return val; // this is the only char
} else if constexpr (UTF8<char_type>) {
// 0xxxxxxx
if ((val & 0b1000'0000U) == 0) {
// we have one char
return val;
}
// 110xxxxx 10xxxxxx
if ((val & 0b1110'0000U) == 0b1100'0000U) {
// we have 2 chars
val &= 0b0001'1111U;
val <<= 6U;
val |= static_cast<code_point_type>(*pos++) & 0b0011'1111U;
return val;
}
// 1110xxxx 10xxxxxx 10xxxxxx
if ((val & 0b1111'0000U) == 0b1110'0000U) {
// we have 3 chars
val &= 0b0000'1111U;
val <<= 12U;
val |= (static_cast<code_point_type>(*pos) & 0b0011'1111U) << 6U;
// input ends after the second unit: step back so that only the first unit is consumed
// NOTE(review): the returned unit is not passed through unsigned_char_type like `val` is
if (++pos == end) {
return *stl::prev(--pos); // bad code point found, return the first code unit
}
val |= static_cast<code_point_type>(*pos++) & 0b0011'1111U;
return val;
}
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
if ((val & 0b1111'1000U) == 0b1111'0000U) {
// we have 4 chars
val &= 0b0000'0111U;
val <<= 18U;
val |= (static_cast<code_point_type>(*pos) & 0b0011'1111U) << 12U;
// input ends after the second unit
if (++pos == end) {
return *stl::prev(--pos); // bad code point found, return the first code unit
}
val |= (static_cast<code_point_type>(*pos) & 0b0011'1111U) << 6U;
// input ends after the third unit: rewind to just after the first unit
if (++pos == end) {
stl::advance(pos, -2);
return *stl::prev(pos); // bad code point found, return the first code unit
}
val |= static_cast<code_point_type>(*pos++) & 0b0011'1111U;
return val;
}
// the first unit matches none of the lead patterns above (e.g. a continuation byte)
return val; // return this one anyway
} else {
// neither UTF-16 nor UTF-8 code units: one unit is one code point
return val;
}
}

// Convert a UTF-8 string to UTF-32 with v1::next_code_point.
//
// Decoding stops at the end of `src` or at the first code point that decodes to 0,
// whichever comes first; the 0 itself is not appended.
//
// @param src  UTF-8 input
// @return     the decoded code points, one element per code point
std::u32string utf8_to_utf32(std::u8string const& src) {
    std::u32string out;
    // Every decoded code point consumes at least one code unit of `src`, so the result
    // can never hold more elements than `src` has code units.
    out.reserve(src.length());

    auto pos = src.begin();
    auto const end = src.end();
    while (pos != end) {
        auto const code_point = next_code_point(pos, end);
        if (code_point == 0) {
            break;
        }
        out.push_back(code_point);
    }
    // `out` already has exactly one element per decoded code point. It must not be resized
    // to the number of consumed code units: that would pad the result with U+0000.
    return out;
}


} // namespace v1

namespace v2 {
using namespace webpp;
using namespace webpp::unicode;
using namespace webpp::unicode::checked;

// Decode one code point starting at `pos`, validating the sequence, and advance `pos`.
//
// @tparam ErrorHandling  what to return for a malformed or truncated sequence:
//                        return_replacement_char -> replacement_char<CodePointType>,
//                        return_negated_char     -> the negated first code unit,
//                        otherwise               -> the first code unit unchanged
// @tparam CodePointType  type of the returned code point
// @tparam Iter           random access iterator over the code units; its value_type selects
//                        the UTF-8 / UTF-16 / UTF-32 branch
// @param  pos            current position; updated in place
// @param  end            end of the input
// @return 0 when `pos == end`; the decoded code point for a well-formed sequence (`pos` is then
//         past the whole sequence); the error value described above otherwise (`pos` is then
//         just past the first code unit, so the caller can resynchronise on the next unit).
template <error_handling ErrorHandling = error_handling::return_unchanged,
          UTF32 CodePointType = char32_t,
          stl::random_access_iterator Iter = char8_t const*>
[[nodiscard]] static constexpr CodePointType next_code_point(Iter& pos, Iter const& end) noexcept {
    using enum error_handling;
    using code_point_type = CodePointType;
    using iter_traits = stl::iterator_traits<Iter>;
    using char_type = typename iter_traits::value_type;
    using unsigned_char_type = stl::make_unsigned_t<char_type>;
    using difference_type = typename iter_traits::difference_type;

    if (pos == end) {
        return static_cast<code_point_type>(0); // return \0 if we're at the end already
    }

    // double casting to make sure negative values can't come out of it
    auto const cu1 = static_cast<code_point_type>(static_cast<unsigned_char_type>(*pos++));
    auto code_point = cu1;

    // Single-pass loop used as a structured jump (goto is not usable in constexpr functions):
    // every `break` out of it lands in the error handling at the bottom of the function.
    for (;;) {
        if constexpr (UTF32<char_type>) {
            return cu1;
        } else if constexpr (UTF16<char_type>) {
            // A code unit outside 0xD800..0xDFFF is a complete code point on its own; it must
            // not go through the error path, otherwise the non-default error policies would
            // replace or negate perfectly valid characters.
            if ((cu1 & 0xF800U) != 0xD800U) {
                return cu1;
            }
            if (pos == end) [[unlikely]] {
                break; // surrogate at the very end of the input
            }
            auto const cu2 = static_cast<code_point_type>(static_cast<unsigned_char_type>(*pos++));
            bool error = (cu1 & 0xFC00U) != 0xD800U; // first unit must be a high surrogate
            error |= (cu2 & 0xFC00U) != 0xDC00U;     // second unit must be a low surrogate
            // we have two chars
            code_point &= 0x3FFU;
            code_point <<= 10U;
            code_point |= cu2 & 0x3FFU;
            code_point += 0x1'0000U;
            if (error) [[unlikely]] {
                --pos; // un-read the second unit
                code_point = cu1;
                break;
            }
            return code_point;
        } else if constexpr (UTF8<char_type>) {
            // presumably the sequence length implied by the lead byte; see required_length_of
            auto const len = required_length_of<char_type, difference_type>(cu1);
            if (end - pos < len - 1) [[unlikely]] {
                break; // the sequence is cut off by the end of the input
            }
            switch (len) {
                case 1:
                    if ((cu1 & 0b1000'0000U) != 0) [[unlikely]] {
                        break; // not ASCII although reported as a single unit
                    }
                    return cu1;
                case 2: {
                    auto const cu2 =
                      static_cast<code_point_type>(static_cast<unsigned_char_type>(*pos++));
                    bool error = (cu1 & 0b1110'0000U) != 0b1100'0000U;
                    error |= (cu2 & 0b1100'0000U) != 0b1000'0000U;
                    code_point &= 0b0001'1111U;
                    code_point <<= 6U;
                    code_point |= cu2 & 0b0011'1111U;
                    // the range check rejects overlong encodings
                    if (error || code_point < 0x80 || 0x7ff < code_point) [[unlikely]] {
                        --pos;
                        code_point = cu1;
                        break;
                    }
                    return code_point;
                }
                case 3: {
                    auto const cu2 =
                      static_cast<code_point_type>(static_cast<unsigned_char_type>(*pos++));
                    auto const cu3 =
                      static_cast<code_point_type>(static_cast<unsigned_char_type>(*pos++));
                    bool error = (cu1 & 0b1111'0000U) != 0b1110'0000U;
                    error |= (cu2 & 0b1100'0000U) != 0b1000'0000U;
                    error |= (cu3 & 0b1100'0000U) != 0b1000'0000U;
                    code_point &= 0b0000'1111U;
                    code_point <<= 12U;
                    code_point |= (cu2 & 0b0011'1111U) << 6U;
                    code_point |= cu3 & 0b0011'1111U;
                    // rejects overlong encodings and encoded surrogates
                    if (error || code_point < 0x800U || 0xFFFFU < code_point ||
                        (0xD7FFU < code_point && code_point < 0xE000U)) [[unlikely]]
                    {
                        stl::advance(pos, -2);
                        code_point = cu1;
                        break;
                    }
                    return code_point;
                }
                case 4: {
                    auto const cu2 =
                      static_cast<code_point_type>(static_cast<unsigned_char_type>(*pos++));
                    auto const cu3 =
                      static_cast<code_point_type>(static_cast<unsigned_char_type>(*pos++));
                    auto const cu4 =
                      static_cast<code_point_type>(static_cast<unsigned_char_type>(*pos++));
                    // A 4-unit lead byte is 11110xxx: all five top bits take part in the check,
                    // so that 0xF8..0xFF are not accepted as lead bytes.
                    bool error = (cu1 & 0b1111'1000U) != 0b1111'0000U;
                    error |= (cu2 & 0b1100'0000U) != 0b1000'0000U;
                    error |= (cu3 & 0b1100'0000U) != 0b1000'0000U;
                    error |= (cu4 & 0b1100'0000U) != 0b1000'0000U;
                    code_point &= 0b0000'0111U;
                    code_point <<= 18U;
                    code_point |= (cu2 & 0b0011'1111U) << 12U;
                    code_point |= (cu3 & 0b0011'1111U) << 6U;
                    code_point |= cu4 & 0b0011'1111U;
                    // rejects overlong encodings and values above U+10FFFF
                    if (error || code_point <= 0xFFFFU || 0x10'FFFFU < code_point) [[unlikely]] {
                        stl::advance(pos, -3);
                        code_point = cu1;
                        break;
                    }
                    return code_point;
                }
                default: break;
            }
            break; // every path that left the switch without returning is an error

        } else {
            static_assert_false(char_type, "Invalid code unit type.");
            return cu1;
        }
    }

    // handle errors: at this point `code_point` is the first code unit
    if constexpr (ErrorHandling == return_replacement_char) {
        return replacement_char<code_point_type>;
    } else if constexpr (ErrorHandling == return_negated_char) {
        return -code_point;
    } else {
        return code_point;
    }
}

// Convert a UTF-8 string to UTF-32 with v2::next_code_point (default error handling).
//
// Decoding stops at the end of `src` or at the first call that yields 0, whichever comes
// first; the 0 itself is not appended.
//
// @param src  UTF-8 input
// @return     one element per call to next_code_point that yielded a non-zero value
std::u32string utf8_to_utf32(std::u8string const& src) {
    std::u32string out;
    // Each call to next_code_point consumes at least one code unit of `src`, so the result
    // can never hold more elements than `src` has code units.
    out.reserve(src.length());

    auto pos = src.begin();
    auto const end = src.end();
    while (pos != end) {
        auto const code_point = next_code_point(pos, end);
        if (code_point == 0) {
            break;
        }
        out.push_back(code_point);
    }
    // `out` already has exactly one element per decoded code point. It must not be resized
    // to the number of consumed code units: that would pad the result with U+0000.
    return out;
}

} // namespace v2

// Input shared by both benchmarks: 10'000 randomly generated code points encoded as UTF-8.
// It is built once, outside the measured loops, so both versions decode the same string.
auto const str = str8_generator(10'000);

// Benchmark: convert the shared UTF-8 input to UTF-32 with the v1 implementation.
static void UTFConv_v1(benchmark::State& state) {
    for ([[maybe_unused]] auto _ : state) {
        std::u32string decoded = v1::utf8_to_utf32(str);
        // keep the result alive so the conversion is not optimized away
        benchmark::DoNotOptimize(decoded);
    }
}

BENCHMARK(UTFConv_v1);

// Benchmark: convert the shared UTF-8 input to UTF-32 with the v2 implementation.
static void UTFConv_v2(benchmark::State& state) {
    for ([[maybe_unused]] auto _ : state) {
        std::u32string decoded = v2::utf8_to_utf32(str);
        // keep the result alive so the conversion is not optimized away
        benchmark::DoNotOptimize(decoded);
    }
}

BENCHMARK(UTFConv_v2);

// NOLINTEND(*-magic-numbers)
Loading

0 comments on commit 9c4a925

Please sign in to comment.