From 9c4a92578624df18b32c3764eb999b4ce1c6036c Mon Sep 17 00:00:00 2001 From: "M. Bahoosh" <12122474+the-moisrex@users.noreply.github.com> Date: Sat, 21 Dec 2024 01:18:19 -1000 Subject: [PATCH] Benchmarking next_code_point; #576 --- benchmarks/CMakeLists.txt | 1 + benchmarks/common_utils_pch.hpp | 50 ++++ benchmarks/utf_convertion/Makefile | 32 ++ benchmarks/utf_convertion/README.md | 12 + .../utf_conversion_benchmark.cpp | 280 ++++++++++++++++++ tests/unicode_test.cpp | 2 +- webpp/unicode/unicode.hpp | 6 +- 7 files changed, 379 insertions(+), 4 deletions(-) create mode 100644 benchmarks/utf_convertion/Makefile create mode 100644 benchmarks/utf_convertion/README.md create mode 100644 benchmarks/utf_convertion/utf_conversion_benchmark.cpp diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index fc79fc71..223681ea 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -37,6 +37,7 @@ set(FILE_SOURCES uri/uri_benchmark.cpp charset/charset_benchmark.cpp interleave_bits/interleave_benchmark.cpp + utf_convertion/utf_conversion_benchmark.cpp ) file(GLOB FILE_PCH *_pch.hpp) diff --git a/benchmarks/common_utils_pch.hpp b/benchmarks/common_utils_pch.hpp index 803a29c7..7cef5f51 100644 --- a/benchmarks/common_utils_pch.hpp +++ b/benchmarks/common_utils_pch.hpp @@ -20,6 +20,56 @@ static StrType str_generator( return str.substr(0, size); } +static std::u8string str8_generator(std::size_t size = 10'000) { + std::u8string str; + str.reserve(size); + + std::mt19937 generator(std::random_device{}()); + + for (std::size_t i = 0; i < size; ++i) { + // Generate a random code point + std::uniform_int_distribution distribution(0, 0x10'FFFF); // Unicode range + + char32_t const code_point = distribution(generator); + + // Skip invalid or non-characters + if ( + (code_point >= 0xD800 && code_point <= 0xDFFF) || (code_point == 0xFFFE) || (code_point == 0xFFFF)) + { + --i; // Decrement i to try again + continue; + } + + // Convert code point to UTF-8 + if (code_point <= 0x7F) { // 1-byte sequence + str += static_cast(code_point); + } else if (code_point <= 0x7FF) { // 2-byte sequence + str += static_cast(0xC0 | (code_point >> 6)); + str += static_cast(0x80 | (code_point & 0x3F)); + } else if (code_point <= 0xFFFF) { // 3-byte sequence + str += static_cast(0xE0 | (code_point >> 12)); + str += static_cast(0x80 | ((code_point >> 6) & 0x3F)); + str += static_cast(0x80 | (code_point & 0x3F)); + } else { // 4-byte sequence + str += static_cast(0xF0 | (code_point >> 18)); + str += static_cast(0x80 | ((code_point >> 12) & 0x3F)); + str += static_cast(0x80 | ((code_point >> 6) & 0x3F)); + str += static_cast(0x80 | (code_point & 0x3F)); + } + } + + return str; +} + +template +static std::array str8_array_generator(std::size_t size = 10'000) { + std::array strs; + for (auto& str : strs) { + str = str8_generator(size); + } + return strs; +} + template static std::array str_array_generator( std::size_t size = 10'000, diff --git a/benchmarks/utf_convertion/Makefile b/benchmarks/utf_convertion/Makefile new file mode 100644 index 00000000..741c06f7 --- /dev/null +++ b/benchmarks/utf_convertion/Makefile @@ -0,0 +1,32 @@ +flags = -std=c++20 -isystem /usr/local/include -L/usr/local/lib -lpthread -lbenchmark_main -lbenchmark +optflags = -flto -Ofast -DNDEBUG -march=native -mtune=native +files = utf_conversion_benchmark.cpp + +all: gcc +.PHONY: all + +gcc: $(files) + g++ $(flags) $(optflags) $(files) + +clang: $(files) + clang++ $(flags) $(optflags) $(files) + +gcc-noopt: $(files) + g++ $(flags) $(files) + +clang-noopt: $(files) + clang++ $(flags) $(files) + +gcc-profile-generate: $(files) + g++ $(flags) $(optflags) -fprofile-generate $(files) + +clang-profile-generate: $(files) + clang++ $(flags) $(optflags) -fprofile-generate $(files) + +gcc-profile-use: $(files) + g++ $(flags) $(optflags) -fprofile-use $(files) + +clang-profile-use: $(files) + clang++ $(flags) $(optflags) -fprofile-use $(files) + + diff --git a/benchmarks/utf_convertion/README.md b/benchmarks/utf_convertion/README.md new file mode 100644 index 00000000..5655a61a --- /dev/null +++ b/benchmarks/utf_convertion/README.md @@ -0,0 +1,12 @@ +# UTF conversions benchmark + +``` +----------------------------------------------------- +Benchmark Time CPU Iterations +----------------------------------------------------- +UTFConv_v1 19903 ns 19867 ns 34903 +UTFConv_v2 24368 ns 24171 ns 29093 +``` + +The version 2 does more things obviously. + diff --git a/benchmarks/utf_convertion/utf_conversion_benchmark.cpp b/benchmarks/utf_convertion/utf_conversion_benchmark.cpp new file mode 100644 index 00000000..38140e3a --- /dev/null +++ b/benchmarks/utf_convertion/utf_conversion_benchmark.cpp @@ -0,0 +1,280 @@ +#include "../../webpp/unicode/unicode.hpp" +#include "../benchmark.hpp" +#include "../common_utils_pch.hpp" + +// NOLINTBEGIN(*-magic-numbers) +namespace v1 { + + using namespace webpp; + using namespace webpp::unicode; + using namespace webpp::unicode::unchecked; + + template + [[nodiscard]] static constexpr CodePointType next_code_point(Iter& pos, EIter end) noexcept { + using code_point_type = CodePointType; + using char_type = typename stl::iterator_traits::value_type; + using unsigned_char_type = stl::make_unsigned_t; + + if (pos == end) { + return static_cast(0); + } + + // double casting to make sure negative values can't come out of it + auto val = static_cast(static_cast(*pos++)); + if (pos == end) { + return val; + } + if constexpr (UTF16) { + if ((val & 0xFC00U) == 0xD800U) { + // we have two chars + val &= 0x3FFU; + val <<= 10U; + val |= static_cast(*pos++) & 0x3FFU; + val += 0x1'0000U; + return val; + } + return val; // this is the only char + } else if constexpr (UTF8) { + if ((val & 0b1000'0000U) == 0) { + // we have one char + return val; + } + if ((val & 0b1110'0000U) == 0b1100'0000U) { + // we have 2 chars + val &= 0b0001'1111U; + val <<= 6U; + val |= static_cast(*pos++) & 0b0011'1111U; + return val; + } + if ((val & 0b1111'0000U) == 0b1110'0000U) { + // we have 3 chars + val &= 0b0000'1111U; + val <<= 12U; + val |= (static_cast(*pos) & 0b0011'1111U) << 6U; + if (++pos == end) { + return *stl::prev(--pos); // bad code point found, return the first code unit + } + val |= static_cast(*pos++) & 0b0011'1111U; + return val; + } + if ((val & 0b1111'1000U) == 0b1111'0000U) { + // we have 4 chars + val &= 0b0000'0111U; + val <<= 18U; + val |= (static_cast(*pos) & 0b0011'1111U) << 12U; + if (++pos == end) { + return *stl::prev(--pos); // bad code point found, return the first code unit + } + val |= (static_cast(*pos) & 0b0011'1111U) << 6U; + if (++pos == end) { + stl::advance(pos, -2); + return *stl::prev(pos); // bad code point found, return the first code unit + } + val |= static_cast(*pos++) & 0b0011'1111U; + return val; + } + return val; // return this one anyway + } else { + return val; + } + } + + std::u32string utf8_to_utf32(std::u8string const& src) { + std::u32string out; + out.reserve(src.length() * 4); // Estimate maximum size of UTF-8 string + + auto pos = src.begin(); + while (pos != src.end()) { + auto const code_point = next_code_point(pos, src.end()); + if (code_point == 0) { + break; + } + out.push_back(code_point); + } + out.resize(pos - src.begin()); + + return out; + } + + +} // namespace v1 + +namespace v2 { + using namespace webpp; + using namespace webpp::unicode; + using namespace webpp::unicode::checked; + + template + [[nodiscard]] static constexpr CodePointType next_code_point(Iter& pos, Iter const& end) noexcept { + using enum error_handling; + using code_point_type = CodePointType; + using iter_traits = stl::iterator_traits; + using char_type = typename iter_traits::value_type; + using unsigned_char_type = stl::make_unsigned_t; + using difference_type = typename iter_traits::difference_type; + + if (pos == end) { + return static_cast(0); // return \0 if we're at the end already + } + + auto const cu1 = static_cast(static_cast(*pos++)); + auto code_point = cu1; + + // we're in a constexpr land, we can't use goto; damn all of you developers who think goto + // is not good enough for you; well, guess what, you're not smart enough to use goto. + for (;;) { + // double casting to make sure negative values can't come out of it + if constexpr (UTF32) { + return cu1; + } else if constexpr (UTF16) { + if (pos == end) [[unlikely]] { + break; + } + auto const cu2 = static_cast(static_cast(*pos++)); + bool error = (cu1 & 0xFC00U) != 0xD800U; + error |= (cu2 & 0xFC00U) != 0xDC00U; + // we have two chars + code_point &= 0x3FFU; + code_point <<= 10U; + code_point |= cu2 & 0x3FFU; + code_point += 0x1'0000U; + if (error) [[unlikely]] { + --pos; + code_point = cu1; + break; + } + return code_point; + } else if constexpr (UTF8) { + auto const len = required_length_of(cu1); + if (end - pos < len - 1) [[unlikely]] { + break; + } + switch (len) { + case 1: + if ((cu1 & 0b1000'0000U) != 0) [[unlikely]] { + break; + } + return cu1; + case 2: { + auto const cu2 = + static_cast(static_cast(*pos++)); + bool error = (cu1 & 0b1110'0000U) != 0b1100'0000U; + error |= (cu2 & 0b1100'0000U) != 0b1000'0000U; + code_point &= 0b0001'1111U; + code_point <<= 6U; + code_point |= cu2 & 0b0011'1111U; + if (error || code_point < 0x80 || 0x7ff < code_point) [[unlikely]] { + --pos; + code_point = cu1; + break; + } + return code_point; + } + case 3: { + auto const cu2 = + static_cast(static_cast(*pos++)); + auto const cu3 = + static_cast(static_cast(*pos++)); + bool error = (cu1 & 0b1111'0000U) != 0b1110'0000U; + error |= (cu2 & 0b1100'0000U) != 0b1000'0000U; + error |= (cu3 & 0b1100'0000U) != 0b1000'0000U; + code_point &= 0b0000'1111U; + code_point <<= 12U; + code_point |= (cu2 & 0b0011'1111U) << 6U; + code_point |= cu3 & 0b0011'1111U; + if (error || code_point < 0x800U || 0xFFFFU < code_point || + (0xD7FFU < code_point && code_point < 0xE000U)) [[unlikely]] + { + stl::advance(pos, -2); + code_point = cu1; + break; + } + return code_point; + } + case 4: { + auto const cu2 = + static_cast(static_cast(*pos++)); + auto const cu3 = + static_cast(static_cast(*pos++)); + auto const cu4 = + static_cast(static_cast(*pos++)); + bool error = (cu1 & 0b1111'0000) != 0b1111'0000; + error |= (cu2 & 0b1100'0000U) != 0b1000'0000U; + error |= (cu3 & 0b1100'0000U) != 0b1000'0000U; + error |= (cu4 & 0b1100'0000U) != 0b1000'0000U; + code_point &= 0b0000'0111U; + code_point <<= 18U; + code_point |= (cu2 & 0b0011'1111U) << 12U; + code_point |= (cu3 & 0b0011'1111U) << 6U; + code_point |= cu4 & 0b0011'1111U; + if (error || code_point <= 0xFFFFU || 0x10'FFFFU < code_point) [[unlikely]] { + stl::advance(pos, -3); + code_point = cu1; + break; + } + return code_point; + } + default: break; + } + break; + + } else { + static_assert_false(char_type, "Invalid code unit type."); + return cu1; + } + } + + // handle errors: + if constexpr (ErrorHandling == return_replacement_char) { + return replacement_char; + } else if constexpr (ErrorHandling == return_negated_char) { + return -code_point; + } else { + return code_point; + } + } + + std::u32string utf8_to_utf32(std::u8string const& src) { + std::u32string out; + out.reserve(src.length() * 4); // Estimate maximum size of UTF-8 string + + auto pos = src.begin(); + while (pos != src.end()) { + auto const code_point = next_code_point(pos, src.end()); + if (code_point == 0) { + break; + } + out.push_back(code_point); + } + out.resize(pos - src.begin()); + + return out; + } + +} // namespace v2 + +auto const str = str8_generator(10'000); + +static void UTFConv_v1(benchmark::State& state) { + for ([[maybe_unused]] auto _ : state) { + auto res = v1::utf8_to_utf32(str); + benchmark::DoNotOptimize(res); + } +} + +BENCHMARK(UTFConv_v1); + +static void UTFConv_v2(benchmark::State& state) { + for ([[maybe_unused]] auto _ : state) { + auto res = v2::utf8_to_utf32(str); + benchmark::DoNotOptimize(res); + } +} + +BENCHMARK(UTFConv_v2); + +// NOLINTEND(*-magic-numbers) diff --git a/tests/unicode_test.cpp b/tests/unicode_test.cpp index f1ff9501..7b513fad 100644 --- a/tests/unicode_test.cpp +++ b/tests/unicode_test.cpp @@ -6914,7 +6914,7 @@ TEST(Unicode, CheckedNextCodePoint) { using enum webpp::unicode::checked::error_handling; std::u8string str = u8"\xac"; - EXPECT_EQ(next_code_point_copy(str.begin(), str.end()), U'\xac'); + EXPECT_EQ(next_code_point_copy(str.begin(), str.end()), U'\xac'); EXPECT_EQ(next_code_point_copy(str.begin(), str.end()), -U'\xac'); EXPECT_EQ(next_code_point_copy(str.begin(), str.end()), replacement_char); diff --git a/webpp/unicode/unicode.hpp b/webpp/unicode/unicode.hpp index 1c84bce2..6ab370c3 100644 --- a/webpp/unicode/unicode.hpp +++ b/webpp/unicode/unicode.hpp @@ -948,12 +948,12 @@ namespace webpp::unicode { } enum struct error_handling : stl::uint8_t { - return_unchanged = 0, - return_replacement_char = 1, + return_replacement_char = 0, + return_unchanged = 1, return_negated_char = 2, }; - template [[nodiscard]] static constexpr CodePointType next_code_point(Iter& pos, Iter const& end) noexcept {