/*
 * Patch summary: in src/flb_utils.c, flb_utils_write_str() no longer omits a
 * malformed UTF-8 sequence; it writes the 3-byte replacement character
 * "\xEF\xBF\xBD" in its place and logs at debug level instead of warn.
 * tests/internal/utils.c gains struct write_str_case, the table-driven helper
 * write_str_test_cases() and four new test functions registered in TEST_LIST.
 * NOTE(review): this copy of the diff has its line breaks and indentation
 * collapsed, and the #include target in the first hunk appears to be missing,
 * so it presumably will not apply as-is.
 */
diff --git a/src/flb_utils.c b/src/flb_utils.c index e05a1dec625..a2be44b1326 100644 --- a/src/flb_utils.c +++ b/src/flb_utils.c @@ -47,6 +47,8 @@ extern struct flb_aws_error_reporter *error_reporter; #include #endif +static char *flb_utils_replacement_character = "\xEF\xBF\xBD"; + void flb_utils_error(int err) { char *msg = NULL; @@ -744,23 +746,38 @@ int flb_utils_write_str(char *buf, int *off, size_t size, is_valid = FLB_TRUE; for (b = 0; b < hex_bytes; b++) { - /* Trailing characters must start with bit 1 */ - if (b > 0 && !(str[i+b] & 0x80)) { - /* Invalid unicode character. skip */ - flb_warn("[pack] unexpected UTF-8 continuation byte, " - "omitting a character"); + /* Leading characters must start with bits 11 */ + if (b == 0 && ((str[i] & 0xC0) != 0xC0)) { + /* Invalid unicode character. replace */ + flb_debug("[pack] unexpected UTF-8 leading byte, " + "substituting character with replacement character"); + ++i; /* Consume invalid leading byte */ + is_valid = FLB_FALSE; + break; + } + /* Trailing characters must start with bits 10 */ + else if (b > 0 && ((str[i] & 0xC0) != 0x80)) { + /* Invalid unicode character. 
replace */ + flb_debug("[pack] unexpected UTF-8 continuation byte, " + "substituting character with replacement character"); + /* This byte, i, is the start of the next unicode character */ is_valid = FLB_FALSE; break; } - tmp[b] = str[i+b]; + tmp[b] = str[i]; + ++i; /* NOTE(review): i advances once per sequence byte; confirm code outside this hunk keeps str[i] within the input length for truncated sequences */ } + --i; if (is_valid) { encoded_to_buf(p, tmp, hex_bytes); p += hex_bytes; } - i += (hex_bytes - 1); + else { + encoded_to_buf(p, flb_utils_replacement_character, 3); + p += 3; + } } else { *p++ = c; diff --git a/tests/internal/utils.c b/tests/internal/utils.c index 3e02b74362f..37334887734 100644 --- a/tests/internal/utils.c +++ b/tests/internal/utils.c @@ -16,6 +16,12 @@ struct url_check { char *uri; /* expected uri */ }; +struct write_str_case { + char *input; + int input_len; + char *output; +}; + struct url_check url_checks[] = { {0, "https://fluentbit.io/something", "https", "fluentbit.io", "443", "/something"}, @@ -112,6 +118,28 @@ void test_url_split() } } +/* Runs flb_utils_write_str() on every case up to the all-zero sentinel entry and checks the return value and the bytes written to buf; case strings must be null terminated */ +static void write_str_test_cases(struct write_str_case *cases) { + char buf[100] = {0}; + int size = sizeof(buf); + int off; + int ret; + + struct write_str_case *tcase = cases; + while (!(tcase->input == 0 && tcase->output == 0)) { + memset(buf, 0, size); + off = 0; + ret = flb_utils_write_str(buf, &off, size, tcase->input, tcase->input_len); + TEST_CHECK(ret == FLB_TRUE); + if(!TEST_CHECK(memcmp(buf, tcase->output, off) == 0)) { /* NOTE(review): only the first off bytes are compared, so (assuming off ends up as the number of bytes written) an output shorter than expected, including off == 0, still passes; consider also comparing off with strlen(tcase->output) */ + TEST_MSG("Input string: %s", tcase->input); + TEST_MSG("| Expected output: %s", tcase->output); + TEST_MSG("| Produced output: %s", buf); + } + ++tcase; + } +} + void test_write_str() { char buf[10]; @@ -147,6 +175,101 @@ void test_write_str() TEST_CHECK(ret == FLB_FALSE); } +void test_write_str_invalid_trailing_bytes() +{ + struct write_str_case cases[] = { + /* Invalid unicode (one bad trailing byte) */ + { + "\xe3\x81\x01""abc", 6, /* note that 0x01 is an invalid byte */ + "�""\\u0001abc" /* replace invalid unicode */ + }, + + /* Invalid unicode (two bad trailing 
bytes) */ + { + "\xe3\x01\x01""abc", 6, + "�""\\u0001\\u0001abc" + }, + { 0 } + }; + + write_str_test_cases(cases); +} + +void test_write_str_invalid_leading_byte() +{ + + struct write_str_case cases[] = { + /* + * Escaped leading hex (two hex, one valid unicode) + */ + { + "\x00\x01\xe3\x81\x82""abc", 8, /* note that 0x00 and 0x01 are expected to be escaped, not replaced */ + "\\u0000\\u0001""\xe3\x81\x82""abc" /* escape hex */ + }, + /* + * Invalid unicode fragment (leading byte followed by only two trailing bytes) + * note that 0xf3 is a leading byte with 3 trailing bytes. note that 0xe3 is also a + * leading byte with 2 trailing bytes. This should not be consumed by 0xf3 invalid + * unicode character + */ + { + "\xf3\x81\x81\xe3\x81\x82""abc", 9, /* note that 0xf3 0x81 0x81 is an invalid fragment */ + "�""\xe3\x81\x82""abc" /* replace invalid unicode */ + }, + + /* + * Invalid unicode (one bad leading byte + one bad trailing byte) + * note that 0xf3 is a leading byte with 3 trailing bytes. 0x01 is an invalid byte + */ + { + "\xf3\x81\x01\xe3\x81\x82""abc", 9, /* note that 0x01 is an invalid byte */ + "�""\\u0001""\xe3\x81\x82""abc" /* replace invalid unicode */ + }, + { 0 } + }; + + write_str_test_cases(cases); +} + +void test_write_str_invalid_leading_byte_case_2() +{ + + struct write_str_case cases[] = { + /* Invalid leading bytes */ + { + "\x81\x82""abc", 5, /* note that 0x81 & 0x82 are invalid leading bytes */ + "��abc" /* replace invalid unicode */ + }, + + /* + * Invalid unicode (one bad leading byte + 1 bad trailing byte + 2 bad leading bytes) + * note that 0xf3 is a leading byte with 3 trailing bytes. 0x01 is an invalid byte + * 0x81 & 0x82 are invalid leading bytes + */ + { + "\xf3\x81\x01\x81\x82""abc", 8, /* note that 0x01 is an invalid byte */ + "�\\u0001��abc" /* replace invalid unicode */ + }, + { 0 } + }; + + write_str_test_cases(cases); +} + +void test_write_str_edge_cases() +{ + struct write_str_case cases[] = { + /* Truncated unicode (lone leading byte, its trailing bytes are missing) */ + { + "\xe3", 1, /* will this buffer overrun? 
*/ + "" /* discard invalid unicode */ + }, + { 0 } + }; + + write_str_test_cases(cases); +} + struct proxy_url_check { int ret; char *url; /* full URL */ @@ -264,6 +387,10 @@ TEST_LIST = { /* JSON maps iteration */ { "url_split", test_url_split }, { "write_str", test_write_str }, + { "test_write_str_invalid_trailing_bytes", test_write_str_invalid_trailing_bytes }, + { "test_write_str_invalid_leading_byte", test_write_str_invalid_leading_byte }, + { "test_write_str_edge_cases", test_write_str_edge_cases }, + { "test_write_str_invalid_leading_byte_case_2", test_write_str_invalid_leading_byte_case_2 }, { "proxy_url_split", test_proxy_url_split }, { 0 } };