diff --git a/include/vcpkg/base/message-data.inc.h b/include/vcpkg/base/message-data.inc.h index 3e9672d83c..bd08ee1e25 100644 --- a/include/vcpkg/base/message-data.inc.h +++ b/include/vcpkg/base/message-data.inc.h @@ -866,10 +866,6 @@ DECLARE_MESSAGE(CommandFailed, "{command_line}\n" "failed with the following results:") DECLARE_MESSAGE(CommunityTriplets, (), "", "Community Triplets:") -DECLARE_MESSAGE(ComparingUtf8Decoders, - (), - "", - "Comparing Utf8Decoders with different provenance; this is always an error") DECLARE_MESSAGE(CompressFolderFailed, (msg::path), "", "Failed to compress folder \"{path}\":") DECLARE_MESSAGE(ComputingInstallPlan, (), "", "Computing installation plan...") DECLARE_MESSAGE(ConfigurationErrorRegistriesWithoutBaseline, @@ -1702,7 +1698,6 @@ DECLARE_MESSAGE(IllegalPlatformSpec, (), "", "Platform qualifier is not allowed DECLARE_MESSAGE(ImproperShaLength, (msg::value), "{value} is a sha.", "SHA512's must be 128 hex characters: {value}") DECLARE_MESSAGE(IncorrectArchiveFileSignature, (), "", "Incorrect archive file signature") DECLARE_MESSAGE(IncorrectPESignature, (), "", "Incorrect PE signature") -DECLARE_MESSAGE(IncrementedUtf8Decoder, (), "", "Incremented Utf8Decoder at the end of the string") DECLARE_MESSAGE(InfoSetEnvVar, (msg::env_var), "In this context 'editor' means IDE", diff --git a/include/vcpkg/base/unicode.h b/include/vcpkg/base/unicode.h index 75b163559a..89b931e2a5 100644 --- a/include/vcpkg/base/unicode.h +++ b/include/vcpkg/base/unicode.h @@ -31,7 +31,6 @@ namespace vcpkg::Unicode Utf8CodeUnitKind utf8_code_unit_kind(unsigned char code_unit) noexcept; - constexpr int utf8_code_unit_count(Utf8CodeUnitKind kind) noexcept { return static_cast(kind); } int utf8_code_unit_count(char code_unit) noexcept; int utf8_encode_code_point(char (&array)[4], char32_t code_point) noexcept; @@ -87,20 +86,20 @@ namespace vcpkg::Unicode bool utf8_is_valid_string(const char* first, const char* last) noexcept; - constexpr bool utf16_is_leading_surrogate_code_point(char32_t code_point) + constexpr bool utf16_is_leading_surrogate_code_point(char32_t code_point) noexcept { return code_point >= 0xD800 && code_point < 0xDC00; } - constexpr bool utf16_is_trailing_surrogate_code_point(char32_t code_point) + constexpr bool utf16_is_trailing_surrogate_code_point(char32_t code_point) noexcept { return code_point >= 0xDC00 && code_point < 0xE000; } - constexpr bool utf16_is_surrogate_code_point(char32_t code_point) + constexpr bool utf16_is_surrogate_code_point(char32_t code_point) noexcept { return code_point >= 0xD800 && code_point < 0xE000; } - char32_t utf16_surrogates_to_code_point(char32_t leading, char32_t trailing); + char32_t utf16_surrogates_to_code_point(char32_t leading, char32_t trailing) noexcept; /* There are two ways to parse utf-8: we could allow unpaired surrogates (as in [wtf-8]) -- this is important @@ -119,7 +118,7 @@ namespace vcpkg::Unicode struct Utf8Decoder { constexpr Utf8Decoder() noexcept : current_(end_of_file), next_(nullptr), last_(nullptr) { } - explicit constexpr Utf8Decoder(StringView sv) : Utf8Decoder(sv.begin(), sv.end()) { } + explicit constexpr Utf8Decoder(StringView sv) noexcept : Utf8Decoder(sv.begin(), sv.end()) { } constexpr Utf8Decoder(const char* first, const char* last) noexcept : current_(0), next_(first), last_(last) { if (next_ != last_) @@ -131,13 +130,31 @@ namespace vcpkg::Unicode current_ = end_of_file; } } + constexpr Utf8Decoder(StringView sv, utf8_errc& first_decode_error) noexcept + : Utf8Decoder(sv.begin(), sv.end(), first_decode_error) + { + } + constexpr Utf8Decoder(const char* first, const char* last, utf8_errc& first_decode_error) noexcept + : current_(0), next_(first), last_(last) + { + if (next_ != last_) + { + first_decode_error = next(); + } + else + { + current_ = end_of_file; + first_decode_error = utf8_errc::NoError; + } + } + struct sentinel { }; constexpr inline bool is_eof() const noexcept { return current_ == end_of_file; } - [[nodiscard]] utf8_errc next(); + [[nodiscard]] utf8_errc next() noexcept; Utf8Decoder& operator=(sentinel) noexcept; @@ -161,7 +178,24 @@ namespace vcpkg::Unicode constexpr sentinel end() const { return sentinel(); } - friend bool operator==(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept; + friend constexpr bool operator==(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept + { + if (lhs.last_ != rhs.last_) + { + // comparing decoders of different provenance is always an error + Checks::unreachable(VCPKG_LINE_INFO); + } + + return lhs.next_ == rhs.next_ && lhs.current_ == rhs.current_; + } + friend constexpr bool operator!=(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept + { + return !(lhs == rhs); + } + friend constexpr bool operator==(const Utf8Decoder& d, Utf8Decoder::sentinel) noexcept { return d.is_eof(); } + friend constexpr bool operator==(Utf8Decoder::sentinel s, const Utf8Decoder& d) noexcept { return d == s; } + friend constexpr bool operator!=(const Utf8Decoder& d, Utf8Decoder::sentinel) noexcept { return !d.is_eof(); } + friend constexpr bool operator!=(Utf8Decoder::sentinel s, const Utf8Decoder& d) noexcept { return d != s; } using difference_type = std::ptrdiff_t; using value_type = char32_t; @@ -174,11 +208,4 @@ namespace vcpkg::Unicode const char* next_; const char* last_; }; - - inline bool operator!=(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept { return !(lhs == rhs); } - - constexpr bool operator==(const Utf8Decoder& d, Utf8Decoder::sentinel) { return d.is_eof(); } - constexpr bool operator==(Utf8Decoder::sentinel s, const Utf8Decoder& d) { return d == s; } - constexpr bool operator!=(const Utf8Decoder& d, Utf8Decoder::sentinel) { return !d.is_eof(); } - constexpr bool operator!=(Utf8Decoder::sentinel s, const Utf8Decoder& d) { return d != s; } } diff --git a/locales/messages.json b/locales/messages.json index 01a12a6aca..9c421a2340 100644 --- a/locales/messages.json +++ b/locales/messages.json @@ -511,7 +511,6 @@ "CommandFailed": "command:\n{command_line}\nfailed with the following results:", "_CommandFailed.comment": "An example of {command_line} is vcpkg install zlib.", "CommunityTriplets": "Community Triplets:", - "ComparingUtf8Decoders": "Comparing Utf8Decoders with different provenance; this is always an error", "CompressFolderFailed": "Failed to compress folder \"{path}\":", "_CompressFolderFailed.comment": "An example of {path} is /foo/bar.", "ComputingInstallPlan": "Computing installation plan...", @@ -972,7 +971,6 @@ "_ImproperShaLength.comment": "{value} is a sha.", "IncorrectArchiveFileSignature": "Incorrect archive file signature", "IncorrectPESignature": "Incorrect PE signature", - "IncrementedUtf8Decoder": "Incremented Utf8Decoder at the end of the string", "InfoSetEnvVar": "You can also set {env_var} to your editor of choice.", "_InfoSetEnvVar.comment": "In this context 'editor' means IDE An example of {env_var} is VCPKG_DEFAULT_TRIPLET.", "InitRegistryFailedNoRepo": "Could not create a registry at {path} because this is not a git repository root.\nUse `git init {command_line}` to create a git repository in this folder.", diff --git a/src/vcpkg-test/ci-baseline.cpp b/src/vcpkg-test/ci-baseline.cpp index e89c883b4a..337e7f95cb 100644 --- a/src/vcpkg-test/ci-baseline.cpp +++ b/src/vcpkg-test/ci-baseline.cpp @@ -258,7 +258,7 @@ TEST_CASE ("Parse Errors", "[ci-baseline]") { check_error("hello", R"(test:1:6: error: expected ':' here on expression: hello - ^)"); + ^)"); check_error("hello\n:", R"(test:1:6: error: expected ':' here on expression: hello @@ -271,7 +271,7 @@ TEST_CASE ("Parse Errors", "[ci-baseline]") check_error("x64-windows:", R"(test:1:13: error: expected a triplet name here (must be lowercase, digits, '-') on expression: x64-windows: - ^)"); + ^)"); check_error("x64-windows:\nport:x64-windows=skip", R"(test:1:13: error: expected a triplet name here (must be lowercase, digits, '-') @@ -285,7 +285,7 @@ TEST_CASE ("Parse Errors", "[ci-baseline]") // clang-format off check_error(" \tx64-windows:", R"(test:1:21: error: expected a triplet name here (must be lowercase, digits, '-') on expression: )" "\t" R"(x64-windows: - )" "\t" R"( ^)"); + )" "\t" R"( ^)"); // clang-format on check_error("port:x64-windows\n=fail", R"(test:1:17: error: expected '=' here diff --git a/src/vcpkg-test/manifests.cpp b/src/vcpkg-test/manifests.cpp index ce1fde2770..ce3a8a58d6 100644 --- a/src/vcpkg-test/manifests.cpp +++ b/src/vcpkg-test/manifests.cpp @@ -1267,7 +1267,7 @@ TEST_CASE ("license error messages", "[manifests][license]") CHECK(messages.error->to_string() == R"(:1:8: error: Expected a license name, found the end of the string. on expression: MIT AND - ^)"); + ^)"); parse_spdx_license_expression("MIT AND unknownlicense", messages); CHECK(!messages.error); diff --git a/src/vcpkg-test/unicode.cpp b/src/vcpkg-test/unicode.cpp new file mode 100644 index 0000000000..75e19a49b1 --- /dev/null +++ b/src/vcpkg-test/unicode.cpp @@ -0,0 +1,106 @@ +#include + +#include + +#include + +using namespace vcpkg::Unicode; + +TEST_CASE ("Utf8Decoder valid", "[unicode]") +{ + const char32_t* expected = U""; + const char* input = ""; + SECTION ("hello") + { + expected = U"hello"; + input = "hello"; + } + + SECTION ("all types of code points") + { + expected = U"one: a two: \u00E9 three: \u672C four: \U0001F3C8"; + input = "one: a two: \xC3\xA9 three: \xE6\x9C\xAC four: \xF0\x9F\x8F\x88"; + } + + SECTION ("wtf-8 leading") + { + // U+1F3C8 as WTF-8 + static constexpr char32_t storage[] = {0xD83C, 0}; + expected = storage; + input = "\xED\xA0\xBC"; + } + + SECTION ("wtf-8 trailing") + { + // U+1F3C8 as WTF-8 + static constexpr char32_t storage[] = {0xDFC8, 0}; + expected = storage; + input = "\xED\xBF\x88"; + } + + auto input_end = input + strlen(input); + Utf8Decoder decode(input); + // strlen for char32_t: + size_t expected_size = 0; + for (auto* e = expected; *e; ++e) + { + ++expected_size; + } + + auto decode_at_end = std::next(decode, expected_size); + for (size_t idx = 0; idx < expected_size; ++idx) + { + REQUIRE(decode != decode.end()); // compare sentinel + REQUIRE(decode != decode_at_end); // compare iterator + REQUIRE(*decode == expected[idx]); + REQUIRE(!decode.is_eof()); + char32_t decoded; + auto pointer_to_current = decode.pointer_to_current(); + REQUIRE(utf8_decode_code_point(pointer_to_current, input_end, decoded).second == utf8_errc::NoError); + REQUIRE(decoded == expected[idx]); + char encoded[4]; + auto encoded_size = utf8_encode_code_point(encoded, decoded); + REQUIRE(std::equal(encoded, encoded + encoded_size, pointer_to_current)); + ++decode; + } + + REQUIRE(decode == decode.end()); + REQUIRE(decode == decode_at_end); +} + +TEST_CASE ("Utf8Decoder first decode empty", "[unicode]") +{ + utf8_errc err; + Utf8Decoder uut("", err); + REQUIRE(err == utf8_errc::NoError); + REQUIRE(uut.is_eof()); + REQUIRE(uut == uut.end()); + REQUIRE(uut == uut); +} + +TEST_CASE ("Utf8Decoder invalid", "[unicode]") +{ + utf8_errc err; + // clang-format off + Utf8Decoder uut(GENERATE( + "hello \xFF too big", + "hello \xC3\xBF\xBF\xBF also too big", + "hello \x9C continuation", + "hello \xE0\x28 overlong", + "hello \xED\xA0\xBC\xED\xBF\x88 paired WTF-8", + "missing two: \xC3", + "missing three one: \xE6\x9C", + "missing three two: \xE6", + "missing four one: \xF0\x9F\x8F", + "missing four two: \xF0\x9F", + "missing four three: \xF0" + ), err); + // clang-format on + while (err == utf8_errc::NoError) + { + REQUIRE(!uut.is_eof()); + err = uut.next(); + } + + REQUIRE(uut.is_eof()); +} diff --git a/src/vcpkg/base/unicode.cpp b/src/vcpkg/base/unicode.cpp index 5613be895b..ee44321d06 100644 --- a/src/vcpkg/base/unicode.cpp +++ b/src/vcpkg/base/unicode.cpp @@ -31,6 +31,8 @@ namespace vcpkg::Unicode } } + static constexpr int utf8_code_unit_count(Utf8CodeUnitKind kind) noexcept { return static_cast(kind); } + int utf8_code_unit_count(char code_unit) noexcept { return utf8_code_unit_count(utf8_code_unit_kind(static_cast(code_unit))); @@ -174,7 +176,7 @@ namespace vcpkg::Unicode return err == utf8_errc::NoError; } - char32_t utf16_surrogates_to_code_point(char32_t leading, char32_t trailing) + char32_t utf16_surrogates_to_code_point(char32_t leading, char32_t trailing) noexcept { vcpkg::Checks::check_exit(VCPKG_LINE_INFO, utf16_is_leading_surrogate_code_point(leading)); vcpkg::Checks::check_exit(VCPKG_LINE_INFO, utf16_is_trailing_surrogate_code_point(trailing)); @@ -212,11 +214,12 @@ namespace vcpkg::Unicode return next_ - count; } - utf8_errc Utf8Decoder::next() + utf8_errc Utf8Decoder::next() noexcept { if (is_eof()) { - vcpkg::Checks::msg_exit_with_message(VCPKG_LINE_INFO, msgIncrementedUtf8Decoder); + // incremented Utf8Decoder at the end of the string + Checks::unreachable(VCPKG_LINE_INFO); } if (next_ == last_) @@ -262,14 +265,4 @@ namespace vcpkg::Unicode current_ = end_of_file; return *this; } - - bool operator==(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept - { - if (lhs.last_ != rhs.last_) - { - Checks::msg_exit_with_message(VCPKG_LINE_INFO, msgComparingUtf8Decoders); - } - - return lhs.next_ == rhs.next_; - } }