Skip to content

Commit

Permalink
Fixing utf-16; #576
Browse files: browse the repository at this point in the history
  • Loading branch information
the-moisrex committed Dec 22, 2024
1 parent eb2e31f commit cae43fa
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 27 deletions.
28 changes: 17 additions & 11 deletions benchmarks/utf_convertion/utf_conversion_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,18 +149,21 @@ namespace v2 {
if constexpr (UTF32<char_type>) {
return cu1;
} else if constexpr (UTF16<char_type>) {
if (pos == end) [[unlikely]] {
break;
bool const requires_2_units = (cu1 & 0xFC00U) == 0xD800U;
bool error = false;
if (requires_2_units) {
code_point &= 0x3FFU;
code_point <<= 10U;
if (pos == end) [[unlikely]] {
break;
}
auto const cu2 = static_cast<code_point_type>(static_cast<unsigned_char_type>(*pos++));
error |= (cu1 & 0xFC00U) != 0xD800U;
error |= (cu2 & 0xFC00U) != 0xDC00U;
code_point |= cu2 & 0x3FFU;
code_point += 0x1'0000U;
}
auto const cu2 = static_cast<code_point_type>(static_cast<unsigned_char_type>(*pos++));
bool error = (cu1 & 0xFC00U) != 0xD800U;
error |= (cu2 & 0xFC00U) != 0xDC00U;
// we have two chars
code_point &= 0x3FFU;
code_point <<= 10U;
code_point |= cu2 & 0x3FFU;
code_point += 0x1'0000U;
if (error) [[unlikely]] {
if (error || is_surrogate(code_point)) [[unlikely]] {
--pos;
code_point = cu1;
break;
Expand Down Expand Up @@ -250,6 +253,9 @@ namespace v2 {
if constexpr (ErrorHandling == return_replacement_char) {
return replacement_char<code_point_type>;
} else if constexpr (ErrorHandling == return_negated_char) {
static_assert(stl::is_unsigned_v<code_point_type>,
"The code point type should support negative values if you want us to return "
"negative values as errors.");
return -code_point;
} else {
return code_point;
Expand Down
29 changes: 18 additions & 11 deletions webpp/unicode/unicode.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -978,18 +978,22 @@ namespace webpp::unicode {
if constexpr (UTF32<char_type>) {
return cu1;
} else if constexpr (UTF16<char_type>) {
if (pos == end) [[unlikely]] {
break;
bool const requires_2_units = (cu1 & 0xFC00U) == 0xD800U;
bool error = false;
if (requires_2_units) {
code_point &= 0x3FFU;
code_point <<= 10U;
if (pos == end) [[unlikely]] {
break;
}
auto const cu2 =
static_cast<code_point_type>(static_cast<unsigned_char_type>(*pos++));
error |= (cu1 & 0xFC00U) != 0xD800U;
error |= (cu2 & 0xFC00U) != 0xDC00U;
code_point |= cu2 & 0x3FFU;
code_point += 0x1'0000U;
}
auto const cu2 = static_cast<code_point_type>(static_cast<unsigned_char_type>(*pos++));
bool error = (cu1 & 0xFC00U) != 0xD800U;
error |= (cu2 & 0xFC00U) != 0xDC00U;
// we have two chars
code_point &= 0x3FFU;
code_point <<= 10U;
code_point |= cu2 & 0x3FFU;
code_point += 0x1'0000U;
if (error) [[unlikely]] {
if (error || is_surrogate(code_point)) [[unlikely]] {
--pos;
code_point = cu1;
break;
Expand Down Expand Up @@ -1079,6 +1083,9 @@ namespace webpp::unicode {
if constexpr (ErrorHandling == return_replacement_char) {
return replacement_char<code_point_type>;
} else if constexpr (ErrorHandling == return_negated_char) {
static_assert(stl::is_unsigned_v<code_point_type>,
"The code point type should support negative values if you want us to return "
"negative values as errors.");
return -code_point;
} else {
return code_point;
Expand Down
30 changes: 25 additions & 5 deletions webpp/unicode/utf_reducer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -615,10 +615,11 @@ namespace webpp::unicode {

[[nodiscard]] constexpr value_type operator*() const noexcept {
if constexpr (UTF32<unit_type>) {
return *iter();
return static_cast<value_type>(*iter());
} else {
using enum checked::error_handling;
assert(iter() < reducer->end());
return checked::next_code_point_copy(iter(), reducer->end());
return checked::next_code_point_copy<return_negated_char, value_type>(iter(), reducer->end());
}
}

Expand Down Expand Up @@ -744,12 +745,19 @@ namespace webpp::unicode {
if constexpr (UTF32<unit_type>) {
*iter() = *other;
} else {
auto const new_len = required_length_of<unit_type, stl::int_fast8_t>(*other);
auto const inp_code_point = *other;

// handling invalid code points
if (static_cast<stl::int32_t>(inp_code_point) < 0) [[unlikely]] {
*iter() = -static_cast<unit_type>(inp_code_point);
return;
}

auto const new_len = required_length_of<unit_type, stl::int_fast8_t>(inp_code_point);

// A new length of 0 means the input code point is invalid.
assert(new_len != 0);

set_inplace(*other, new_len);
set_inplace(inp_code_point, new_len);
}
}

Expand All @@ -760,6 +768,12 @@ namespace webpp::unicode {
} else {
assert(iter() < reducer->endptr);

// handling invalid code points
if (static_cast<stl::int32_t>(inp_code_point) < 0) [[unlikely]] {
*iter() = -static_cast<unit_type>(inp_code_point);
return;
}

auto const new_len = required_length_of<unit_type, stl::int_fast8_t>(inp_code_point);
set_inplace(inp_code_point, new_len);
}
Expand All @@ -773,6 +787,12 @@ namespace webpp::unicode {
} else {
assert(iter() != reducer->endptr);

// handling invalid code points
if (static_cast<stl::int32_t>(inp_code_point) < 0) [[unlikely]] {
*iter() = -static_cast<unit_type>(inp_code_point);
return;
}

auto const cur_len = required_length_of<unit_type, stl::int_fast8_t>(*iter());
auto const new_len = utf_length_from_utf32<unit_type, stl::int_fast8_t>(inp_code_point);
auto const old_diff = cur_len - new_len;
Expand Down

0 comments on commit cae43fa

Please sign in to comment.