microsoft · BillyONeal · Jan 5, 2024 · Jan 3, 2024 · Jan 5, 2024
diff --git a/include/vcpkg/base/message-data.inc.h b/include/vcpkg/base/message-data.inc.h
@@ -866,10 +866,6 @@ DECLARE_MESSAGE(CommandFailed,
                 "{command_line}\n"
                 "failed with the following results:")
 DECLARE_MESSAGE(CommunityTriplets, (), "", "Community Triplets:")
-DECLARE_MESSAGE(ComparingUtf8Decoders,
-                (),
-                "",
-                "Comparing Utf8Decoders with different provenance; this is always an error")
 DECLARE_MESSAGE(CompressFolderFailed, (msg::path), "", "Failed to compress folder \"{path}\":")
 DECLARE_MESSAGE(ComputingInstallPlan, (), "", "Computing installation plan...")
 DECLARE_MESSAGE(ConfigurationErrorRegistriesWithoutBaseline,
@@ -1702,7 +1698,6 @@ DECLARE_MESSAGE(IllegalPlatformSpec, (), "", "Platform qualifier is not allowed
 DECLARE_MESSAGE(ImproperShaLength, (msg::value), "{value} is a sha.", "SHA512's must be 128 hex characters: {value}")
 DECLARE_MESSAGE(IncorrectArchiveFileSignature, (), "", "Incorrect archive file signature")
 DECLARE_MESSAGE(IncorrectPESignature, (), "", "Incorrect PE signature")
-DECLARE_MESSAGE(IncrementedUtf8Decoder, (), "", "Incremented Utf8Decoder at the end of the string")
 DECLARE_MESSAGE(InfoSetEnvVar,
                 (msg::env_var),
                 "In this context 'editor' means IDE",

diff --git a/include/vcpkg/base/unicode.h b/include/vcpkg/base/unicode.h
@@ -31,7 +31,6 @@ namespace vcpkg::Unicode
 
     Utf8CodeUnitKind utf8_code_unit_kind(unsigned char code_unit) noexcept;
 
-    constexpr int utf8_code_unit_count(Utf8CodeUnitKind kind) noexcept { return static_cast<int>(kind); }
     int utf8_code_unit_count(char code_unit) noexcept;
 
     int utf8_encode_code_point(char (&array)[4], char32_t code_point) noexcept;
@@ -87,20 +86,20 @@ namespace vcpkg::Unicode
 
     bool utf8_is_valid_string(const char* first, const char* last) noexcept;
 
-    constexpr bool utf16_is_leading_surrogate_code_point(char32_t code_point)
+    constexpr bool utf16_is_leading_surrogate_code_point(char32_t code_point) noexcept
     {
         return code_point >= 0xD800 && code_point < 0xDC00;
     }
-    constexpr bool utf16_is_trailing_surrogate_code_point(char32_t code_point)
+    constexpr bool utf16_is_trailing_surrogate_code_point(char32_t code_point) noexcept
     {
         return code_point >= 0xDC00 && code_point < 0xE000;
     }
-    constexpr bool utf16_is_surrogate_code_point(char32_t code_point)
+    constexpr bool utf16_is_surrogate_code_point(char32_t code_point) noexcept
     {
         return code_point >= 0xD800 && code_point < 0xE000;
     }
 
-    char32_t utf16_surrogates_to_code_point(char32_t leading, char32_t trailing);
+    char32_t utf16_surrogates_to_code_point(char32_t leading, char32_t trailing) noexcept;
 
     /*
         There are two ways to parse utf-8: we could allow unpaired surrogates (as in [wtf-8]) -- this is important
@@ -119,7 +118,7 @@ namespace vcpkg::Unicode
     struct Utf8Decoder
     {
         constexpr Utf8Decoder() noexcept : current_(end_of_file), next_(nullptr), last_(nullptr) { }
-        explicit constexpr Utf8Decoder(StringView sv) : Utf8Decoder(sv.begin(), sv.end()) { }
+        explicit constexpr Utf8Decoder(StringView sv) noexcept : Utf8Decoder(sv.begin(), sv.end()) { }
         constexpr Utf8Decoder(const char* first, const char* last) noexcept : current_(0), next_(first), last_(last)
         {
             if (next_ != last_)
@@ -131,13 +130,31 @@ namespace vcpkg::Unicode
                 current_ = end_of_file;
             }
         }
+        constexpr Utf8Decoder(StringView sv, utf8_errc& first_decode_error) noexcept
+            : Utf8Decoder(sv.begin(), sv.end(), first_decode_error)
+        {
+        }
+        constexpr Utf8Decoder(const char* first, const char* last, utf8_errc& first_decode_error) noexcept
+            : current_(0), next_(first), last_(last) // first_decode_error is assigned on both paths below
+        {
+            if (next_ != last_)
+            {
+                first_decode_error = next(); // non-empty input: load the first code point now
+            }
+            else
+            {
+                current_ = end_of_file; // empty input: the decoder starts out at end of file
+                first_decode_error = utf8_errc::NoError;
+            }
+        }
+
         struct sentinel
         {
         };
 
         constexpr inline bool is_eof() const noexcept { return current_ == end_of_file; }
 
-        [[nodiscard]] utf8_errc next();
+        [[nodiscard]] utf8_errc next() noexcept;
 
         Utf8Decoder& operator=(sentinel) noexcept;
 
@@ -161,7 +178,24 @@ namespace vcpkg::Unicode
 
         constexpr sentinel end() const { return sentinel(); }
 
-        friend bool operator==(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept;
+        friend constexpr bool operator==(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept
+        {
+            if (lhs.last_ != rhs.last_) // last_ doubles as the provenance check
+            {
+                // comparing decoders of different provenance is always an error
+                Checks::unreachable(VCPKG_LINE_INFO);
+            }
+
+            return lhs.next_ == rhs.next_ && lhs.current_ == rhs.current_; // both position and current_ must match
+        }
+        friend constexpr bool operator!=(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept
+        {
+            return !(lhs == rhs);
+        }
+        friend constexpr bool operator==(const Utf8Decoder& d, Utf8Decoder::sentinel) noexcept { return d.is_eof(); }
+        friend constexpr bool operator==(Utf8Decoder::sentinel s, const Utf8Decoder& d) noexcept { return d == s; }
+        friend constexpr bool operator!=(const Utf8Decoder& d, Utf8Decoder::sentinel) noexcept { return !d.is_eof(); }
+        friend constexpr bool operator!=(Utf8Decoder::sentinel s, const Utf8Decoder& d) noexcept { return d != s; }
 
         using difference_type = std::ptrdiff_t;
         using value_type = char32_t;
@@ -174,11 +208,4 @@ namespace vcpkg::Unicode
         const char* next_;
         const char* last_;
     };
-
-    inline bool operator!=(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept { return !(lhs == rhs); }
-
-    constexpr bool operator==(const Utf8Decoder& d, Utf8Decoder::sentinel) { return d.is_eof(); }
-    constexpr bool operator==(Utf8Decoder::sentinel s, const Utf8Decoder& d) { return d == s; }
-    constexpr bool operator!=(const Utf8Decoder& d, Utf8Decoder::sentinel) { return !d.is_eof(); }
-    constexpr bool operator!=(Utf8Decoder::sentinel s, const Utf8Decoder& d) { return d != s; }
 }
diff --git a/locales/messages.json b/locales/messages.json
@@ -511,7 +511,6 @@
   "CommandFailed": "command:\n{command_line}\nfailed with the following results:",
   "_CommandFailed.comment": "An example of {command_line} is vcpkg install zlib.",
   "CommunityTriplets": "Community Triplets:",
-  "ComparingUtf8Decoders": "Comparing Utf8Decoders with different provenance; this is always an error",
   "CompressFolderFailed": "Failed to compress folder \"{path}\":",
   "_CompressFolderFailed.comment": "An example of {path} is /foo/bar.",
   "ComputingInstallPlan": "Computing installation plan...",
@@ -972,7 +971,6 @@
   "_ImproperShaLength.comment": "{value} is a sha.",
   "IncorrectArchiveFileSignature": "Incorrect archive file signature",
   "IncorrectPESignature": "Incorrect PE signature",
-  "IncrementedUtf8Decoder": "Incremented Utf8Decoder at the end of the string",
   "InfoSetEnvVar": "You can also set {env_var} to your editor of choice.",
   "_InfoSetEnvVar.comment": "In this context 'editor' means IDE An example of {env_var} is VCPKG_DEFAULT_TRIPLET.",
   "InitRegistryFailedNoRepo": "Could not create a registry at {path} because this is not a git repository root.\nUse `git init {command_line}` to create a git repository in this folder.",

diff --git a/src/vcpkg-test/ci-baseline.cpp b/src/vcpkg-test/ci-baseline.cpp
@@ -258,7 +258,7 @@ TEST_CASE ("Parse Errors", "[ci-baseline]")
 {
     check_error("hello", R"(test:1:6: error: expected ':' here
   on expression: hello
-                     ^)");
+                      ^)");
 
     check_error("hello\n:", R"(test:1:6: error: expected ':' here
   on expression: hello
@@ -271,7 +271,7 @@ TEST_CASE ("Parse Errors", "[ci-baseline]")
 
     check_error("x64-windows:", R"(test:1:13: error: expected a triplet name here (must be lowercase, digits, '-')
   on expression: x64-windows:
-                            ^)");
+                             ^)");
 
     check_error("x64-windows:\nport:x64-windows=skip",
                 R"(test:1:13: error: expected a triplet name here (must be lowercase, digits, '-')
@@ -285,7 +285,7 @@ TEST_CASE ("Parse Errors", "[ci-baseline]")
     // clang-format off
     check_error("   \tx64-windows:", R"(test:1:21: error: expected a triplet name here (must be lowercase, digits, '-')
   on expression:    )" "\t" R"(x64-windows:
-                    )" "\t" R"(           ^)");
+                    )" "\t" R"(            ^)");
     // clang-format on
 
     check_error("port:x64-windows\n=fail", R"(test:1:17: error: expected '=' here

diff --git a/src/vcpkg-test/manifests.cpp b/src/vcpkg-test/manifests.cpp
@@ -1267,7 +1267,7 @@ TEST_CASE ("license error messages", "[manifests][license]")
     CHECK(messages.error->to_string() ==
           R"(<license string>:1:8: error: Expected a license name, found the end of the string.
   on expression: MIT AND
-                       ^)");
+                        ^)");
 
     parse_spdx_license_expression("MIT AND unknownlicense", messages);
     CHECK(!messages.error);

diff --git a/src/vcpkg-test/unicode.cpp b/src/vcpkg-test/unicode.cpp
@@ -0,0 +1,106 @@
+#include <vcpkg-test/util.h>
+
+#include <vcpkg/base/unicode.h>
+
+#include <iterator>
+
+using namespace vcpkg::Unicode;
+
+TEST_CASE ("Utf8Decoder valid", "[unicode]")
+{
+    const char32_t* expected = U"";
+    const char* input = "";
+    SECTION ("hello")
+    {
+        expected = U"hello";
+        input = "hello";
+    }
+
+    SECTION ("all types of code points")
+    {
+        expected = U"one: a two: \u00E9 three: \u672C four: \U0001F3C8";
+        input = "one: a two: \xC3\xA9 three: \xE6\x9C\xAC four: \xF0\x9F\x8F\x88";
+    }
+
+    SECTION ("wtf-8 leading")
+    {
+        // lone leading surrogate of U+1F3C8 (UTF-16: D83C DFC8), encoded WTF-8 style
+        static constexpr char32_t storage[] = {0xD83C, 0};
+        expected = storage;
+        input = "\xED\xA0\xBC";
+    }
+
+    SECTION ("wtf-8 trailing")
+    {
+        // lone trailing surrogate of U+1F3C8 (UTF-16: D83C DFC8), encoded WTF-8 style
+        static constexpr char32_t storage[] = {0xDFC8, 0};
+        expected = storage;
+        input = "\xED\xBF\x88";
+    }
+
+    auto input_end = input + strlen(input); // NOTE(review): <cstring> is not included directly - confirm
+    Utf8Decoder decode(input);
+    // number of code points in expected (strlen for char32_t):
+    size_t expected_size = 0;
+    for (auto* e = expected; *e; ++e)
+    {
+        ++expected_size;
+    }
+
+    auto decode_at_end = std::next(decode, expected_size); // copy advanced by expected_size code points
+    for (size_t idx = 0; idx < expected_size; ++idx)
+    {
+        REQUIRE(decode != decode.end());  // compare sentinel
+        REQUIRE(decode != decode_at_end); // compare iterator
+        REQUIRE(*decode == expected[idx]);
+        REQUIRE(!decode.is_eof());
+        char32_t decoded;
+        auto pointer_to_current = decode.pointer_to_current();
+        REQUIRE(utf8_decode_code_point(pointer_to_current, input_end, decoded).second == utf8_errc::NoError);
+        REQUIRE(decoded == expected[idx]); // decoding directly from the pointer agrees with the iterator
+        char encoded[4];
+        auto encoded_size = utf8_encode_code_point(encoded, decoded);
+        REQUIRE(std::equal(encoded, encoded + encoded_size, pointer_to_current)); // re-encoding round-trips
+        ++decode;
+    }
+
+    REQUIRE(decode == decode.end());  // compare sentinel
+    REQUIRE(decode == decode_at_end); // compare iterator
+}
+
+TEST_CASE ("Utf8Decoder first decode empty", "[unicode]")
+{
+    utf8_errc err; // assigned by the Utf8Decoder constructor
+    Utf8Decoder uut("", err);
+    REQUIRE(err == utf8_errc::NoError);
+    REQUIRE(uut.is_eof());
+    REQUIRE(uut == uut.end()); // decoder vs. sentinel comparison
+    REQUIRE(uut == uut);       // decoder vs. decoder comparison
+}
+
+TEST_CASE ("Utf8Decoder invalid", "[unicode]")
+{
+    utf8_errc err; // assigned by the Utf8Decoder constructor
+    // clang-format off
+    Utf8Decoder uut(GENERATE(
+        "hello \xFF too big",
+        "hello \xC3\xBF\xBF\xBF also too big",
+        "hello \x9C continuation",
+        "hello \xE0\x28 overlong", // E0 followed by 0x28, which is not a continuation byte
+        "hello \xED\xA0\xBC\xED\xBF\x88 paired WTF-8",
+        "missing two: \xC3",
+        "missing three one: \xE6\x9C",
+        "missing three two: \xE6",
+        "missing four one: \xF0\x9F\x8F",
+        "missing four two: \xF0\x9F",
+        "missing four three: \xF0"
+    ), err);
+    // clang-format on
+    while (err == utf8_errc::NoError) // keep decoding until an error is reported
+    {
+        REQUIRE(!uut.is_eof());
+        err = uut.next();
+    }
+
+    REQUIRE(uut.is_eof()); // once an error has been reported the decoder is expected to be at end of file
+}
diff --git a/src/vcpkg/base/unicode.cpp b/src/vcpkg/base/unicode.cpp
@@ -31,6 +31,8 @@ namespace vcpkg::Unicode
         }
     }
 
+    static constexpr int utf8_code_unit_count(Utf8CodeUnitKind kind) noexcept { return static_cast<int>(kind); }
+
     int utf8_code_unit_count(char code_unit) noexcept
     {
         return utf8_code_unit_count(utf8_code_unit_kind(static_cast<unsigned char>(code_unit)));
@@ -174,7 +176,7 @@ namespace vcpkg::Unicode
         return err == utf8_errc::NoError;
     }
 
-    char32_t utf16_surrogates_to_code_point(char32_t leading, char32_t trailing)
+    char32_t utf16_surrogates_to_code_point(char32_t leading, char32_t trailing) noexcept
     {
         vcpkg::Checks::check_exit(VCPKG_LINE_INFO, utf16_is_leading_surrogate_code_point(leading));
         vcpkg::Checks::check_exit(VCPKG_LINE_INFO, utf16_is_trailing_surrogate_code_point(trailing));
@@ -212,11 +214,12 @@ namespace vcpkg::Unicode
         return next_ - count;
     }
 
-    utf8_errc Utf8Decoder::next()
+    utf8_errc Utf8Decoder::next() noexcept
     {
         if (is_eof())
         {
-            vcpkg::Checks::msg_exit_with_message(VCPKG_LINE_INFO, msgIncrementedUtf8Decoder);
+            // incremented Utf8Decoder at the end of the string
+            Checks::unreachable(VCPKG_LINE_INFO);
         }
 
         if (next_ == last_)
@@ -262,14 +265,4 @@ namespace vcpkg::Unicode
         current_ = end_of_file;
         return *this;
     }
-
-    bool operator==(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept
-    {
-        if (lhs.last_ != rhs.last_)
-        {
-            Checks::msg_exit_with_message(VCPKG_LINE_INFO, msgComparingUtf8Decoders);
-        }
-
-        return lhs.next_ == rhs.next_;
-    }
 }