-
Notifications
You must be signed in to change notification settings - Fork 286
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fix Utf8Decoder operator== handling of the last code point in the input (#1316)
* Fix Utf8Decoder operator== handling of the last code point in the input. While working on diagnostics for #1210 I observed that we were printing the caret ^ in the wrong place when it goes after the input. The way this works is we form the line of text to print, then decode the Unicode code units, and when we hit the target, we stop and print ^: https://github.com/microsoft/vcpkg-tool/blob/5b8f9c40dd9d6a07ec636590e78c2f49d28624b9/src/vcpkg/base/parse.cpp#L51-L68 However, if the intended location for the ^ is the "end" of the line, we hit this bug: https://github.com/microsoft/vcpkg-tool/blob/5b8f9c40dd9d6a07ec636590e78c2f49d28624b9/src/vcpkg/base/unicode.cpp#L273 The iterator only compares the last_ pointers, but both the "points at the last code point in the input" and "points to the end of the input" states set `next_ == last_`. See: https://github.com/microsoft/vcpkg-tool/blob/5b8f9c40dd9d6a07ec636590e78c2f49d28624b9/src/vcpkg/base/unicode.cpp#L222-L226 This means that the iterator pointing at the last code point and the iterator pointing to the end of the input compare equal, so the loop in parse.cpp stops one position too early. Also adds a bunch of testing for this specific case and for other parts of Utf8Decoder, adds a way to parse the first code point without failing, makes all the operators 'hidden friends', and removes localized strings for bugs-in-vcpkg-itself. * Add noexcepts as requested by @Thomas1664
- Loading branch information
1 parent
5b8f9c4
commit 4e75e1c
Showing
7 changed files
with
158 additions
and
39 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
#include <vcpkg-test/util.h> | ||
|
||
#include <vcpkg/base/unicode.h> | ||
|
||
#include <iterator> | ||
|
||
using namespace vcpkg::Unicode; | ||
|
||
// Walks a Utf8Decoder over well-formed input and checks, code point by code point, that:
//   * the decoder is not yet equal to the end sentinel or to an iterator advanced to the end,
//   * dereferencing yields the expected UTF-32 value,
//   * utf8_decode_code_point agrees when started at pointer_to_current(), and
//   * utf8_encode_code_point reproduces the original bytes.
// After the last code point the decoder must equal both the sentinel and the advanced iterator.
TEST_CASE ("Utf8Decoder valid", "[unicode]")
{
    // Placeholders; each SECTION below assigns both values.
    const char* input = "";
    const char32_t* expected = U"";

    SECTION ("hello")
    {
        input = "hello";
        expected = U"hello";
    }

    SECTION ("all types of code points")
    {
        // one-, two-, three- and four-byte encodings
        input = "one: a two: \xC3\xA9 three: \xE6\x9C\xAC four: \xF0\x9F\x8F\x88";
        expected = U"one: a two: \u00E9 three: \u672C four: \U0001F3C8";
    }

    SECTION ("wtf-8 leading")
    {
        // lone leading surrogate (0xD83C) of U+1F3C8, encoded as WTF-8
        static constexpr char32_t storage[] = {0xD83C, 0};
        input = "\xED\xA0\xBC";
        expected = storage;
    }

    SECTION ("wtf-8 trailing")
    {
        // lone trailing surrogate (0xDFC8) of U+1F3C8, encoded as WTF-8
        static constexpr char32_t storage[] = {0xDFC8, 0};
        input = "\xED\xBF\x88";
        expected = storage;
    }

    // Count the code points in the null-terminated expected sequence (strlen for char32_t).
    size_t expected_size = 0;
    while (expected[expected_size] != 0)
    {
        ++expected_size;
    }

    const auto input_end = input + strlen(input);
    Utf8Decoder decode(input);

    // A copy of the decoder advanced past every code point; used for iterator == iterator checks.
    const auto decode_at_end = std::next(decode, expected_size);

    for (size_t i = 0; i != expected_size; ++i)
    {
        const char32_t want = expected[i];

        REQUIRE(decode != decode.end());  // compare sentinel
        REQUIRE(decode != decode_at_end); // compare iterator
        REQUIRE(*decode == want);
        REQUIRE(!decode.is_eof());

        // The free decoding function must agree when started at the decoder's current position.
        const auto current = decode.pointer_to_current();
        char32_t redecoded;
        REQUIRE(utf8_decode_code_point(current, input_end, redecoded).second == utf8_errc::NoError);
        REQUIRE(redecoded == want);

        // Re-encoding must reproduce the bytes found at the current position.
        char reencoded[4];
        const auto reencoded_size = utf8_encode_code_point(reencoded, redecoded);
        REQUIRE(std::equal(reencoded, reencoded + reencoded_size, current));

        ++decode;
    }

    REQUIRE(decode == decode.end());
    REQUIRE(decode == decode_at_end);
}
|
||
// Constructing a decoder over empty input through the error-reporting constructor must report
// NoError and yield a decoder that is at EOF and equal to both the end sentinel and itself.
TEST_CASE ("Utf8Decoder first decode empty", "[unicode]")
{
    utf8_errc first_decode_result;
    Utf8Decoder decoder("", first_decode_result);

    REQUIRE(first_decode_result == utf8_errc::NoError);
    REQUIRE(decoder.is_eof());

    // compare sentinel
    REQUIRE(decoder == decoder.end());
    // compare iterator
    REQUIRE(decoder == decoder);
}
|
||
// Runs once per generated input. The decoder is advanced until it reports something other than
// NoError; before every step it must not be at EOF, and after the loop it must be at EOF.
TEST_CASE ("Utf8Decoder invalid", "[unicode]")
{
    utf8_errc decode_status;
    // clang-format off
    Utf8Decoder decoder(GENERATE(
        "hello \xFF too big",
        "hello \xC3\xBF\xBF\xBF also too big",
        "hello \x9C continuation",
        "hello \xE0\x28 overlong",
        "hello \xED\xA0\xBC\xED\xBF\x88 paired WTF-8",
        "missing two: \xC3",
        "missing three one: \xE6\x9C",
        "missing three two: \xE6",
        "missing four one: \xF0\x9F\x8F",
        "missing four two: \xF0\x9F",
        "missing four three: \xF0"
    ), decode_status);
    // clang-format on

    // The constructor reports the status of the first decode; next() reports each later one.
    for (; decode_status == utf8_errc::NoError; decode_status = decoder.next())
    {
        REQUIRE(!decoder.is_eof());
    }

    REQUIRE(decoder.is_eof());
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters