utils: detect and replace ill-formed utf-8 bytes (#4346)

Previously, for byte sequences such as 0xef 0xbf 0x00 ..., Fluent Bit would blindly trust the leading byte 0xef to describe how many valid trailing bytes to copy. If a trailing byte was invalid, such as 0x00 (the null character), the utility blindly copied it to the escaped string. This commit adds checks that leading and trailing bytes are UTF-8 compliant. If a sequence is invalid, the ill-formed character's bytes are individually mapped to the private use area [U+E000 to U+E0FF], preserving the ill-formed character data in a compact, safe, UTF-8-friendly format. Signed-off-by: Matthew Fala <[email protected]>
fluent · Nov 29, 2021 · 861af37 · 861af37
1 parent bf0f0d2
commit 861af37
Show file tree

Hide file tree

Showing 2 changed files with 319 additions and 10 deletions.
diff --git a/src/flb_utils.c b/src/flb_utils.c
@@ -47,6 +47,14 @@ extern struct flb_aws_error_reporter *error_reporter;
 #include <openssl/rand.h>
 #endif
 
+/*
+ * High byte of the Unicode private use area codepoints that stand in for the
+ * bytes of an ill-formed UTF-8 sequence. Each invalid byte 0xHH is represented
+ * by the codepoint U+<FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR><HH>. With the
+ * descriptor set to 0xE0 the invalid byte 0xCE becomes U+E0CE, so every
+ * replacement codepoint lies in the range U+E000 to U+E0FF.
+ */
+#define FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR 0xE0
+
 void flb_utils_error(int err)
 {
     char *msg = NULL;
@@ -636,6 +644,9 @@ int flb_utils_write_str(char *buf, int *off, size_t size,
     int required;
     int len;
     int hex_bytes;
+    int is_valid;
+    int utf_sequence_number;
+    int utf_sequence_length;
     uint32_t codepoint;
     uint32_t state = 0;
     char tmp[16];
@@ -732,20 +743,102 @@ int flb_utils_write_str(char *buf, int *off, size_t size,
             i += (hex_bytes - 1);
         }
         else if (c > 0xFFFF) {
-            hex_bytes = flb_utf8_len(str + i);
-            if (available - written < 6) {
-                return FLB_FALSE;
-            }
+            utf_sequence_length = flb_utf8_len(str + i);
 
-            if (i + hex_bytes > str_len) {
+            if (i + utf_sequence_length > str_len) {
                 break; /* skip truncated UTF-8 */
             }
-            for (b = 0; b < hex_bytes; b++) {
-                tmp[b] = str[i+b];
+
+            is_valid = FLB_TRUE;
+            for (utf_sequence_number = 0; utf_sequence_number < utf_sequence_length;
+                utf_sequence_number++) {
+                /* Leading characters must start with bits 11 */
+                if (utf_sequence_number == 0 && ((str[i] & 0xC0) != 0xC0)) {
+                    /* Invalid unicode character. replace */
+                    flb_debug("[pack] unexpected UTF-8 leading byte, "
+                             "substituting character with replacement character");
+                    tmp[utf_sequence_number] = str[i];
+                    ++i; /* Consume invalid leading byte */
+                    utf_sequence_length = utf_sequence_number + 1;
+                    is_valid = FLB_FALSE;
+                    break;
+                }
+                /* Trailing characters must start with bits 10 */
+                else if (utf_sequence_number > 0 && ((str[i] & 0xC0) != 0x80)) {
+                    /* Invalid unicode character. replace */
+                    flb_debug("[pack] unexpected UTF-8 continuation byte, "
+                             "substituting character with replacement character");
+                    /* This byte, i, is the start of the next unicode character */
+                    utf_sequence_length = utf_sequence_number;
+                    is_valid = FLB_FALSE;
+                    break;
+                }
+
+                tmp[utf_sequence_number] = str[i];
+                ++i;
+            }
+            --i;
+
+            if (is_valid) {
+                if (available - written < utf_sequence_length) {
+                    return FLB_FALSE;
+                }
+
+                encoded_to_buf(p, tmp, utf_sequence_length);
+                p += utf_sequence_length;
+            }
+            else {
+                if (available - written < utf_sequence_length * 3) {
+                    return FLB_FALSE;
+                }
+
+                /*
+                 * Utf-8 sequence is invalid. Map fragments to private use area
+                 * codepoints in range:
+                 * 0x<FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR>00 to
+                 * 0x<FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR>FF
+                 */
+                for (b = 0; b < utf_sequence_length; ++b) {
+                    /*
+                     * Utf-8 private block invalid hex mapping. Format unicode charpoint
+                     * in the following format:
+                     *
+                     *      +--------+--------+--------+
+                     *      |1110PPPP|10PPPPHH|10HHHHHH|
+                     *      +--------+--------+--------+
+                     *
+                     * Where:
+                     *   P is FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR bits (1 byte)
+                     *   H is Utf-8 fragment hex bits (1 byte)
+                     *   1 is bit 1
+                     *   0 is bit 0
+                     */
+
+                    /* unicode codepoint start */
+                    *p = 0xE0;
+
+                    /* print unicode private block header first 4 bits */
+                    *p |= FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR >> 4;
+                    ++p;
+
+                    /* unicode codepoint middle */
+                    *p = 0x80;
+
+                    /* print end of unicode private block header last 4 bits */
+                    *p |= ((FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR << 2) & 0x3f);
+
+                    /* print hex fragment first 2 bits */
+                    *p |= (tmp[b] >> 6) & 0x03;
+                    ++p;
+
+                    /* unicode codepoint middle */
+                    *p = 0x80;
+
+                    /* print hex fragment last 6 bits */
+                    *p |= tmp[b] & 0x3f;
+                    ++p;
+                }
             }
-            encoded_to_buf(p, tmp, hex_bytes);
-            p += hex_bytes;
-            i += (hex_bytes - 1);
         }
         else {
             *p++ = c;

diff --git a/tests/internal/utils.c b/tests/internal/utils.c
@@ -16,6 +16,13 @@ struct url_check {
     char *uri;     /* expected uri      */
 };
 
+struct write_str_case {  /* one flb_utils_write_str test vector */
+    char *input;         /* bytes passed to flb_utils_write_str as the source string */
+    int input_len;       /* number of bytes of input to process */
+    char *output;        /* expected contents of the output buffer */
+    int ret;             /* expected return value (FLB_TRUE or FLB_FALSE) */
+};
+
 struct url_check url_checks[] = {
     {0, "https://fluentbit.io/something",
      "https", "fluentbit.io", "443", "/something"},
@@ -112,6 +119,54 @@ void test_url_split()
     }
 }
 
+/*
+ * Run a table of flb_utils_write_str cases with the default output buffer
+ * size. The sized variant is declared here because it is defined further down.
+ */
+static void write_str_test_cases_w_buf_size(struct write_str_case *cases, int buf_size);
+static void write_str_test_cases(struct write_str_case *cases)
+{
+    const int default_buf_size = 100;
+
+    write_str_test_cases_w_buf_size(cases, default_buf_size);
+}
+
+/*
+ * Run every entry of `cases` through flb_utils_write_str with the output
+ * limited to `buf_size` bytes.
+ *
+ * `cases` must end with a sentinel entry whose input and output are both 0.
+ * For each case the return value, the produced bytes and the produced length
+ * are checked against the expectation.
+ *
+ * The buffer is allocated one byte larger than the size reported to
+ * flb_utils_write_str. That guard byte must stay zero: it keeps the buffer
+ * NUL-terminated for strlen() and exposes any write past the reported limit.
+ */
+static void write_str_test_cases_w_buf_size(struct write_str_case *cases, int buf_size)
+{
+    struct write_str_case *tcase;
+    char *buf;
+    int size = buf_size + 1;
+    int off;
+    int ret;
+    size_t produced_len;
+    size_t expected_len;
+    size_t cmp_len;
+
+    buf = flb_calloc(size, sizeof(char));
+    if (!TEST_CHECK(buf != NULL)) {
+        TEST_MSG("Could not allocate %d byte output buffer", size);
+        return;
+    }
+
+    for (tcase = cases; !(tcase->input == 0 && tcase->output == 0); ++tcase) {
+        memset(buf, 0, size);
+        off = 0;
+        ret = flb_utils_write_str(buf, &off, buf_size, tcase->input, tcase->input_len);
+
+        if (!TEST_CHECK(ret == tcase->ret)) {
+            TEST_MSG("Input string: %s", tcase->input);
+            TEST_MSG("| Expected return value: %s",
+                     (tcase->ret == FLB_TRUE) ? "FLB_TRUE" : "FLB_FALSE");
+            TEST_MSG("| Produced return value: %s",
+                     (ret == FLB_TRUE) ? "FLB_TRUE" : "FLB_FALSE");
+        }
+
+        /*
+         * Check the guard byte before calling strlen(): if it was overwritten
+         * the buffer is no longer guaranteed to be NUL-terminated. Restore it
+         * so the remaining checks stay within the allocation.
+         */
+        if (!TEST_CHECK(buf[size-1] == 0)) {
+            TEST_MSG("Out buffer overwrite detected '%c'", buf[size-1]);
+            buf[size-1] = 0;
+        }
+
+        /*
+         * Compare what is actually in the buffer, bounded by the shorter of
+         * the two strings. Comparing `off` bytes could read past the end of
+         * the expected string when too much output was produced.
+         * NOTE(review): `off` also appears to stay 0 when the call fails,
+         * which would make an `off`-based comparison check nothing - confirm.
+         */
+        produced_len = strlen(buf);
+        expected_len = strlen(tcase->output);
+        cmp_len = (produced_len < expected_len) ? produced_len : expected_len;
+
+        if (!TEST_CHECK(memcmp(buf, tcase->output, cmp_len) == 0)) {
+            TEST_MSG("Input string: %s", tcase->input);
+            TEST_MSG("| Expected output: %s", tcase->output);
+            TEST_MSG("| Produced output: %s", buf);
+        }
+        if (!TEST_CHECK(produced_len == expected_len)) {
+            TEST_MSG("Input string: %s", tcase->input);
+            TEST_MSG("| Expected length: %zu", expected_len);
+            TEST_MSG("| Produced length: %zu", produced_len);
+            TEST_MSG("| Expected output: %s", tcase->output);
+            TEST_MSG("| Produced output: %s", buf);
+        }
+    }
+
+    flb_free(buf);
+}
+
 void test_write_str()
 {
     char buf[10];
@@ -147,6 +202,162 @@ void test_write_str()
     TEST_CHECK(ret == FLB_FALSE);
 }
 
+void test_write_str_invalid_trailing_bytes()
+{
+    struct write_str_case cases[] = {
+        /*
+         * Invalid unicode (one bad trailing byte): the bytes 0xe3 and 0x81 are
+         * each expected to be replaced by a private use area fragment
+         */
+        {
+            "\xe3\x81\x01""abc", 6,  /* note that 0x01 is not a valid trailing byte */
+            "\xee\x83\xa3" /* e3 fragment */ /* replace invalid unicode */
+            "\xee\x82\x81" /* 81 fragment */
+            "\\u0001abc",
+            FLB_TRUE
+        },
+        /*
+         * Invalid unicode (two bad trailing bytes): only the leading byte 0xe3
+         * is expected to become a fragment
+         */
+        {
+            "\xe3\x01\x01""abc", 6,
+            "\xee\x83\xa3" /* e3 fragment */
+            "\\u0001\\u0001abc",
+            FLB_TRUE
+        },
+        { 0 }  /* end of table */
+    };
+
+    write_str_test_cases(cases);
+}
+
+void test_write_str_invalid_leading_byte()
+{
+
+    struct write_str_case cases[] = {
+        /*
+         * Escaped leading hex (two hex, one valid unicode)
+         */
+        {
+            "\x00\x01\xe3\x81\x82""abc", 8,  /* 0x00 and 0x01 precede a valid 3-byte character */
+            "\\u0000\\u0001""\xe3\x81\x82""abc",  /* escape hex */
+            FLB_TRUE
+        },
+        /*
+         * Invalid unicode fragment (leading byte followed by only two trailing
+         * bytes). Note that 0xf3 is a leading byte with 3 trailing bytes. Note
+         * that 0xe3 is also a leading byte, with 2 trailing bytes. It should not
+         * be consumed by the invalid unicode character started by 0xf3.
+         */
+        {
+            "\xf3\x81\x81\xe3\x81\x82""abc", 9,  /* note that 0xf3 0x81 0x81 is an invalid fragment */
+            "\xee\x83\xb3" /* f3 fragment */ /* replace invalid unicode */
+            "\xee\x82\x81" /* 81 fragment */
+            "\xee\x82\x81" /* 81 fragment */
+            "\xe3\x81\x82""abc", /* valid unicode */
+            FLB_TRUE
+        },
+        /*
+         * Invalid unicode (one bad leading byte + one bad trailing byte)
+         * note that 0xf3 is a leading byte with 3 trailing bytes. 0x01 is an invalid byte
+         */
+        {
+            "\xf3\x81\x01\xe3\x81\x82""abc", 9,  /* note that 0x01 is an invalid byte */
+            "\xee\x83\xb3" /* f3 fragment */ /* replace invalid unicode */
+            "\xee\x82\x81" /* 81 fragment */
+            "\\u0001""\xe3\x81\x82""abc",
+            FLB_TRUE
+        },
+        { 0 }  /* end of table */
+    };
+
+    write_str_test_cases(cases);
+}
+
+void test_write_str_invalid_leading_byte_case_2()
+{
+
+    struct write_str_case cases[] = {
+        /* Invalid leading bytes: each one is expected to become its own fragment */
+        {
+            "\x81\x82""abc", 5,  /* note that 0x81 & 0x82 are invalid leading bytes */
+            "\xee\x82\x81" /* 81 fragment */ /* replace invalid unicode */
+            "\xee\x82\x82" /* 82 fragment */
+            "abc",
+            FLB_TRUE
+        },
+        /*
+         * Invalid unicode (one bad leading byte + one bad trailing byte + two
+         * more bad leading bytes).
+         * Note that 0xf3 is a leading byte with 3 trailing bytes. 0x01 is an
+         * invalid byte. 0x81 & 0x82 are invalid leading bytes.
+         */
+        {
+            "\xf3\x81\x01\x81\x82""abc", 8,  /* note that 0x81 & 0x82 are invalid leading bytes */
+            "\xee\x83\xb3" /* f3 fragment */ /* replace invalid unicode */
+            "\xee\x82\x81" /* 81 fragment */
+            "\\u0001"      /* 0x01 hex escape */
+            "\xee\x82\x81" /* 81 fragment */
+            "\xee\x82\x82" /* 82 fragment */
+            "abc",
+            FLB_TRUE
+        },
+        { 0 }  /* end of table */
+    };
+
+    write_str_test_cases(cases);
+}
+
+void test_write_str_edge_cases()
+{
+    struct write_str_case cases[] = {
+        /*
+         * Truncated unicode: 0xf3 is a leading byte with 3 trailing bytes, but
+         * the input ends right after it
+         */
+        {
+            "\xf3", 1,  /* the missing trailing bytes must not be read past the input length */
+            "",  /* the truncated sequence is expected to produce no output */
+            FLB_TRUE
+        },
+        { 0 }  /* end of table */
+    };
+
+    write_str_test_cases(cases);
+}
+
+void test_write_str_buffer_overrun()
+{
+    struct write_str_case cases[] = {
+        {
+            "aa""\x81", 3,
+            "aa"
+            "\xee\x82\x81", /* just enough space for 81 fragment */
+            FLB_TRUE
+        },
+        {
+            "aaa""\x81", 4, /* out buffer size: 5, needed bytes: 3 (aaa) + 3 (81 fragment) = 6 */
+            "aaa",
+            /* "\xee\x82\x81", */ /* 81 fragment -- would overrun */
+            FLB_FALSE
+        },
+        {
+            "aaa"
+            "\xe3\x81\x82", 6, /* required is already greater than buffer */
+            "",
+            FLB_FALSE
+        },
+        {
+            "\""
+            "\xe3\x81\x82", 4, /* valid unicode */
+            "\\\"""\xe3\x81\x82", /* just enough space for valid unicode */
+            FLB_TRUE
+        },
+        {
+            "\x81"
+            "\xe3\x81\x82", 4, /* valid unicode */
+            "\xee\x82\x81", /* 81 fragment */
+            /* not enough space for valid unicode fragment "\xe3\x81\x82" */
+            FLB_FALSE
+        },
+        { 0 }  /* end of table */
+    };
+    write_str_test_cases_w_buf_size(cases, 5);  /* limit the output to 5 bytes */
+}
+
 struct proxy_url_check {
     int ret;
     char *url;        /* full URL          */
@@ -264,6 +475,11 @@ TEST_LIST = {
     /* JSON maps iteration */
     { "url_split", test_url_split },
     { "write_str", test_write_str },
+    { "test_write_str_invalid_trailing_bytes", test_write_str_invalid_trailing_bytes },
+    { "test_write_str_invalid_leading_byte", test_write_str_invalid_leading_byte },
+    { "test_write_str_edge_cases", test_write_str_edge_cases },
+    { "test_write_str_invalid_leading_byte_case_2", test_write_str_invalid_leading_byte_case_2 },
+    { "test_write_str_buffer_overrun", test_write_str_buffer_overrun },
     { "proxy_url_split", test_proxy_url_split },
     { 0 }
 };