fluent · edsiper · Nov 29, 2021 · Nov 9, 2021
@@ -47,6 +47,14 @@ extern struct flb_aws_error_reporter *error_reporter;
 #include <openssl/rand.h>
 #endif
 
+/*
+ * High byte of the Unicode private-use code points that stand in for invalid
+ * UTF-8 fragments. An invalid byte 0xHH is written out as the UTF-8 encoding of
+ * U+<FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR><HH>; for example, with the
+ * descriptor set to 0xE0 the invalid fragment 0xCE becomes U+E0CE.
+ */
+#define FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR 0xE0
+
 void flb_utils_error(int err)
 {
     char *msg = NULL;
@@ -636,6 +644,9 @@ int flb_utils_write_str(char *buf, int *off, size_t size,
     int required;
     int len;
     int hex_bytes;
+    int is_valid;
+    int utf_sequence_number;
+    int utf_sequence_length;
     uint32_t codepoint;
     uint32_t state = 0;
     char tmp[16];
@@ -732,20 +743,102 @@ int flb_utils_write_str(char *buf, int *off, size_t size,
             i += (hex_bytes - 1);
         }
         else if (c > 0xFFFF) {
-            hex_bytes = flb_utf8_len(str + i);
-            if (available - written < 6) {
-                return FLB_FALSE;
-            }
+            utf_sequence_length = flb_utf8_len(str + i);
 
-            if (i + hex_bytes > str_len) {
+            if (i + utf_sequence_length > str_len) {
                 break; /* skip truncated UTF-8 */
             }
-            for (b = 0; b < hex_bytes; b++) {
-                tmp[b] = str[i+b];
+
+            is_valid = FLB_TRUE;
+            for (utf_sequence_number = 0; utf_sequence_number < utf_sequence_length;
+                utf_sequence_number++) {
+                /* Leading characters must start with bits 11 */
+                if (utf_sequence_number == 0 && ((str[i] & 0xC0) != 0xC0)) {
+                    /* Invalid unicode character. replace */
+                    flb_debug("[pack] unexpected UTF-8 leading byte, "
+                             "substituting character with replacement character");
+                    tmp[utf_sequence_number] = str[i];
+                    ++i; /* Consume invalid leading byte */
+                    utf_sequence_length = utf_sequence_number + 1;
+                    is_valid = FLB_FALSE;
+                    break;
+                }
+                /* Trailing characters must start with bits 10 */
+                else if (utf_sequence_number > 0 && ((str[i] & 0xC0) != 0x80)) {
+                    /* Invalid unicode character. replace */
+                    flb_debug("[pack] unexpected UTF-8 continuation byte, "
+                             "substituting character with replacement character");
+                    /* This byte, i, is the start of the next unicode character */
+                    utf_sequence_length = utf_sequence_number;
+                    is_valid = FLB_FALSE;
+                    break;
+                }
+
+                tmp[utf_sequence_number] = str[i];
+                ++i;
+            }
+            --i;
+
+            if (is_valid) {
+                if (available - written < utf_sequence_length) {
+                    return FLB_FALSE;
+                }
+
+                encoded_to_buf(p, tmp, utf_sequence_length);
+                p += utf_sequence_length;
+            }
+            else {
+                if (available - written < utf_sequence_length * 3) {
+                    return FLB_FALSE;
+                }
+
+                /*
+                 * Utf-8 sequence is invalid. Map fragments to private use area
+                 * codepoints in range:
+                 * 0x<FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR>00 to
+                 * 0x<FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR>FF
+                 */
+                for (b = 0; b < utf_sequence_length; ++b) {
+                    /*
+                     * Utf-8 private block invalid hex mapping. Format unicode charpoint
+                     * in the following format:
+                     *
+                     *      +--------+--------+--------+
+                     *      |1110PPPP|10PPPPHH|10HHHHHH|
+                     *      +--------+--------+--------+
+                     *
+                     * Where:
+                     *   P is FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR bits (1 byte)
+                     *   H is Utf-8 fragment hex bits (1 byte)
+                     *   1 is bit 1
+                     *   0 is bit 0
+                     */
+
+                    /* unicode codepoint start */
+                    *p = 0xE0;
+
+                    /* print unicode private block header first 4 bits */
+                    *p |= FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR >> 4;
+                    ++p;
+
+                    /* unicode codepoint middle */
+                    *p = 0x80;
+
+                    /* print end of unicode private block header last 4 bits */
+                    *p |= ((FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR << 2) & 0x3f);
+
+                    /* print hex fragment first 2 bits */
+                    *p |= (tmp[b] >> 6) & 0x03;
+                    ++p;
+
+                    /* unicode codepoint middle */
+                    *p = 0x80;
+
+                    /* print hex fragment last 6 bits */
+                    *p |= tmp[b] & 0x3f;
+                    ++p;
+                }
             }
-            encoded_to_buf(p, tmp, hex_bytes);
-            p += hex_bytes;
-            i += (hex_bytes - 1);
         }
         else {
             *p++ = c;

@@ -16,6 +16,13 @@ struct url_check {
     char *uri;     /* expected uri      */
 };
 
+/*
+ * One flb_utils_write_str test case. A table of cases is terminated by an
+ * entry whose input and output are both NULL.
+ */
+struct write_str_case {
+    char *input;       /* string handed to flb_utils_write_str */
+    int input_len;     /* length passed along with input */
+    char *output;      /* expected contents of the output buffer */
+    int ret;           /* expected return value */
+};
+
 struct url_check url_checks[] = {
     {0, "https://fluentbit.io/something",
      "https", "fluentbit.io", "443", "/something"},
@@ -112,6 +119,54 @@ void test_url_split()
     }
 }
 
+/* Declared ahead of its definition because the default-size wrapper comes first */
+static void write_str_test_cases_w_buf_size(struct write_str_case *cases, int buf_size);
+
+/* Run the flb_utils_write_str cases against a 100 byte output buffer */
+static void write_str_test_cases(struct write_str_case *cases)
+{
+    const int default_buf_size = 100;
+
+    write_str_test_cases_w_buf_size(cases, default_buf_size);
+}
+
+/*
+ * Run every entry of 'cases' through flb_utils_write_str() with the output
+ * limited to 'buf_size' bytes, checking the return value and the produced data.
+ *
+ * 'cases' must end with an entry whose input and output are both NULL.
+ *
+ * The buffer is allocated one byte larger than 'buf_size' and zeroed before
+ * each case: the extra byte is a guard that must still be zero afterwards, and
+ * the zero fill allows the produced data to be measured with strlen().
+ */
+static void write_str_test_cases_w_buf_size(struct write_str_case *cases, int buf_size)
+{
+    int size = buf_size + 1;
+    int off;
+    int ret;
+    size_t cmp_len;
+    size_t expected_len;
+    char *buf;
+    struct write_str_case *tcase;
+
+    buf = flb_calloc(size, sizeof(char));
+    if (!TEST_CHECK(buf != NULL)) {
+        TEST_MSG("Could not allocate %d byte output buffer", size);
+        return;
+    }
+
+    for (tcase = cases; !(tcase->input == NULL && tcase->output == NULL); ++tcase) {
+        memset(buf, 0, size);
+        off = 0;
+        ret = flb_utils_write_str(buf, &off, buf_size, tcase->input, tcase->input_len);
+
+        if (!TEST_CHECK(ret == tcase->ret)) {
+            TEST_MSG("Input string: %s", tcase->input);
+            TEST_MSG("| Expected return value: %s",
+                     (tcase->ret == FLB_TRUE) ? "FLB_TRUE" : "FLB_FALSE");
+            TEST_MSG("| Produced return value: %s",
+                     (ret == FLB_TRUE) ? "FLB_TRUE" : "FLB_FALSE");
+        }
+
+        /*
+         * Compare the 'off' bytes reported as written, but never read beyond
+         * the expected string (terminator included) or beyond the allocation.
+         * Including the terminator still flags output longer than expected.
+         */
+        expected_len = strlen(tcase->output);
+        cmp_len = (size_t) off;
+        if (cmp_len > expected_len + 1) {
+            cmp_len = expected_len + 1;
+        }
+        if (cmp_len > (size_t) size) {
+            cmp_len = (size_t) size;
+        }
+        if (!TEST_CHECK(memcmp(buf, tcase->output, cmp_len) == 0)) {
+            TEST_MSG("Input string: %s", tcase->input);
+            TEST_MSG("| Expected output: %s", tcase->output);
+            TEST_MSG("| Produced output: %s", buf);
+        }
+
+        if (!TEST_CHECK(strlen(buf) == expected_len)) {
+            TEST_MSG("Input string: %s", tcase->input);
+            TEST_MSG("| Expected length: %zu", expected_len);
+            TEST_MSG("| Produced length: %zu", strlen(buf));
+            TEST_MSG("| Expected output: %s", tcase->output);
+            TEST_MSG("| Produced output: %s", buf);
+        }
+
+        /* The guard byte past 'buf_size' must never be written */
+        if (!TEST_CHECK(buf[size-1] == 0)) {
+            TEST_MSG("Out buffer overwrite detected '%c'", buf[size-1]);
+        }
+    }
+
+    flb_free(buf);
+}
+
 void test_write_str()
 {
     char buf[10];
@@ -147,6 +202,162 @@ void test_write_str()
     TEST_CHECK(ret == FLB_FALSE);
 }
 
+/*
+ * A leading byte followed by bytes that are not valid continuation bytes.
+ * The bytes consumed as part of the broken sequence are expected as
+ * private-use fragments (0xHH -> U+E0HH, UTF-8 encoded), while the offending
+ * byte is expected to be handled as the start of the next character.
+ */
+void test_write_str_invalid_trailing_bytes()
+{
+    struct write_str_case cases[] = {
+        /* Invalid unicode (one bad trailing byte) */
+        {
+            "\xe3\x81\x01""abc", 6,  /* 0x01 is not a valid continuation byte */
+            "\xee\x83\xa3" /* e3 fragment */ /* replace invalid unicode */
+            "\xee\x82\x81" /* 81 fragment */
+            "\\u0001abc",  /* 0x01 escaped, rest copied as is */
+            FLB_TRUE
+        },
+        /*
+         * Invalid unicode (two bad trailing bytes): only the leading byte
+         * becomes a fragment, both 0x01 bytes are escaped
+         */
+        {
+            "\xe3\x01\x01""abc", 6,
+            "\xee\x83\xa3" /* e3 fragment */
+            "\\u0001\\u0001abc",
+            FLB_TRUE
+        },
+        { 0 } /* end of cases */
+    };
+
+    write_str_test_cases(cases);
+}
+
+/*
+ * Leading bytes that announce more continuation bytes than actually follow,
+ * mixed with escaped control bytes and valid characters. The valid character
+ * that follows a broken sequence must come out untouched.
+ */
+void test_write_str_invalid_leading_byte()
+{
+
+    struct write_str_case cases[] = {
+        /*
+         * Escaped leading hex (two hex, one valid unicode)
+         */
+        {
+            "\x00\x01\xe3\x81\x82""abc", 8,  /* 0x00 and 0x01 are expected as \u0000 and \u0001 */
+            "\\u0000\\u0001""\xe3\x81\x82""abc",  /* escape hex */
+            FLB_TRUE
+        },
+        /*
+         * Invalid unicode fragment (leading byte plus two continuation bytes)
+         * note that 0xf3 is a leading byte with 3 trailing bytes. note that 0xe3 is also a
+         * leading byte with 2 trailing bytes. This should not be consumed by 0xf3 invalid
+         * unicode character
+         */
+        {
+            "\xf3\x81\x81\xe3\x81\x82""abc", 9,  /* note that 0xf3 0x81 0x81 is an invalid fragment */
+            "\xee\x83\xb3" /* f3 fragment */ /* replace invalid unicode */
+            "\xee\x82\x81" /* 81 fragment */
+            "\xee\x82\x81" /* 81 fragment */
+            "\xe3\x81\x82""abc", /* valid unicode */
+            FLB_TRUE
+        },
+        /*
+         * Invalid unicode (one bad leading byte + one bad trailing byte)
+         * note that 0xf3 is a leading byte with 3 trailing bytes. 0x01 is an invalid byte
+         */
+        {
+            "\xf3\x81\x01\xe3\x81\x82""abc", 9,  /* 0x01 is not a valid continuation byte */
+            "\xee\x83\xb3" /* f3 fragment */ /* replace invalid unicode */
+            "\xee\x82\x81" /* 81 fragment */
+            "\\u0001""\xe3\x81\x82""abc", /* 0x01 escaped, then valid unicode */
+            FLB_TRUE
+        },
+        { 0 } /* end of cases */
+    };
+
+    write_str_test_cases(cases);
+}
+
+/*
+ * Bytes of the form 10xxxxxx (0x81, 0x82) in a position where a leading byte
+ * is required. Each one is expected as its own private-use fragment.
+ */
+void test_write_str_invalid_leading_byte_case_2()
+{
+
+    struct write_str_case cases[] = {
+        /* Invalid leading bytes */
+        {
+            "\x81\x82""abc", 5,  /* note that 0x81 & 0x82 are invalid leading bytes */
+            "\xee\x82\x81" /* 81 fragment */ /* replace invalid unicode */
+            "\xee\x82\x82" /* 82 fragment */
+            "abc",
+            FLB_TRUE
+        },
+        /*
+         * Invalid unicode (one bad leading byte + one bad trailing byte + two bad leading bytes)
+         * note that 0xf3 is a leading byte with 3 trailing bytes. 0x01 is an invalid byte
+         * 0x81 & 0x82 are invalid leading bytes
+         */
+        {
+            "\xf3\x81\x01\x81\x82""abc", 8,  /* note that 0x81 & 0x82 are invalid leading bytes */
+            "\xee\x83\xb3" /* f3 fragment */ /* replace invalid unicode */
+            "\xee\x82\x81" /* 81 fragment */
+            "\\u0001"      /* 0x01 hex escape */
+            "\xee\x82\x81" /* 81 fragment */
+            "\xee\x82\x82" /* 82 fragment */
+            "abc",
+            FLB_TRUE
+        },
+        { 0 } /* end of cases */
+    };
+
+    write_str_test_cases(cases);
+}
+
+/*
+ * Input that ends in the middle of a multi-byte sequence: nothing is expected
+ * in the output and the call is still expected to return FLB_TRUE.
+ */
+void test_write_str_edge_cases()
+{
+    struct write_str_case cases[] = {
+        /* Invalid unicode (lone leading byte, continuation bytes missing) */
+        {
+            "\xf3", 1,  /* must not read past the end of the input */
+            "",  /* discard invalid unicode */
+            FLB_TRUE
+        },
+        { 0 } /* end of cases */
+    };
+
+    write_str_test_cases(cases);
+}
+
+/*
+ * Output space handling. All cases run against a 5 byte output buffer, so the
+ * expected output is whatever fits before the call gives up with FLB_FALSE.
+ */
+void test_write_str_buffer_overrun()
+{
+    struct write_str_case cases[] = {
+        {
+            "aa""\x81", 3, /* needed bytes: 2 + 3 = 5 */
+            "aa"
+            "\xee\x82\x81", /* just enough space for 81 fragment */
+            FLB_TRUE
+        },
+        {
+            "aaa""\x81", 4, /* out buffer size: 5, needed bytes: 3 + 3 = 6 */
+            "aaa",
+            /* "\xee\x82\x81", */ /* 81 fragment -- would overrun */
+            FLB_FALSE
+        },
+        {
+            "aaa"
+            "\xe3\x81\x82", 6, /* required is already greater than buffer */
+            "",
+            FLB_FALSE
+        },
+        {
+            "\""
+            "\xe3\x81\x82", 4, /* valid unicode */
+            "\\\"""\xe3\x81\x82", /* just enough space for valid unicode */
+            FLB_TRUE
+        },
+        {
+            "\x81"
+            "\xe3\x81\x82", 4, /* valid unicode */
+            "\xee\x82\x81", /* 81 fragment */
+            /* not enough space for valid unicode fragment "\xe3\x81\x82" */
+            FLB_FALSE
+        },
+        { 0 } /* end of cases */
+    };
+    write_str_test_cases_w_buf_size(cases, 5);
+}
+
 struct proxy_url_check {
     int ret;
     char *url;        /* full URL          */
@@ -264,6 +475,11 @@ TEST_LIST = {
     /* JSON maps iteration */
     { "url_split", test_url_split },
     { "write_str", test_write_str },
+    { "test_write_str_invalid_trailing_bytes", test_write_str_invalid_trailing_bytes },
+    { "test_write_str_invalid_leading_byte", test_write_str_invalid_leading_byte },
+    { "test_write_str_edge_cases", test_write_str_edge_cases },
+    { "test_write_str_invalid_leading_byte_case_2", test_write_str_invalid_leading_byte_case_2 },
+    { "test_write_str_buffer_overrun", test_write_str_buffer_overrun },
     { "proxy_url_split", test_proxy_url_split },
     { 0 }
 };