Skip to content

Commit

Permalink
utils: detect and replace ill-formed utf-8 bytes (#4346)
Browse files Browse the repository at this point in the history
Previously with unicode byte sequences such as

   0xef 0xbf 0x00 ...

Fluent Bit would blindly trust the leading byte 0xef to describe
how many trailing bytes to copy, without validating them.
If a trailing byte was invalid, such as 0x00 (the null character),
the utility copied it to the escaped string anyway.

This commit adds checks for leading and trailing byte utf-8 compliance.
If invalid, the ill-formed character's bytes are individually mapped to
private use area [U+E000 to U+E0FF] preserving ill-formed character data
in a compact and safe utf-8 friendly format.

Signed-off-by: Matthew Fala <[email protected]>
  • Loading branch information
matthewfala authored Nov 29, 2021
1 parent bf0f0d2 commit 861af37
Show file tree
Hide file tree
Showing 2 changed files with 319 additions and 10 deletions.
113 changes: 103 additions & 10 deletions src/flb_utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,14 @@ extern struct flb_aws_error_reporter *error_reporter;
#include <openssl/rand.h>
#endif

/*
 * High byte of the Unicode private use block used to represent invalid UTF-8
 * fragments. Each byte of an ill-formed sequence is mapped to the codepoint
 * U+<FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR><HEX_FRAGMENT>; for example,
 * with the descriptor set to 0xE0 the invalid fragment 0xCE becomes U+E0CE.
 */
#define FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR 0xE0

void flb_utils_error(int err)
{
char *msg = NULL;
Expand Down Expand Up @@ -636,6 +644,9 @@ int flb_utils_write_str(char *buf, int *off, size_t size,
int required;
int len;
int hex_bytes;
int is_valid;
int utf_sequence_number;
int utf_sequence_length;
uint32_t codepoint;
uint32_t state = 0;
char tmp[16];
Expand Down Expand Up @@ -732,20 +743,102 @@ int flb_utils_write_str(char *buf, int *off, size_t size,
i += (hex_bytes - 1);
}
else if (c > 0xFFFF) {
hex_bytes = flb_utf8_len(str + i);
if (available - written < 6) {
return FLB_FALSE;
}
utf_sequence_length = flb_utf8_len(str + i);

if (i + hex_bytes > str_len) {
if (i + utf_sequence_length > str_len) {
break; /* skip truncated UTF-8 */
}
for (b = 0; b < hex_bytes; b++) {
tmp[b] = str[i+b];

is_valid = FLB_TRUE;
for (utf_sequence_number = 0; utf_sequence_number < utf_sequence_length;
utf_sequence_number++) {
/* Leading characters must start with bits 11 */
if (utf_sequence_number == 0 && ((str[i] & 0xC0) != 0xC0)) {
/* Invalid unicode character. replace */
flb_debug("[pack] unexpected UTF-8 leading byte, "
"substituting character with replacement character");
tmp[utf_sequence_number] = str[i];
++i; /* Consume invalid leading byte */
utf_sequence_length = utf_sequence_number + 1;
is_valid = FLB_FALSE;
break;
}
/* Trailing characters must start with bits 10 */
else if (utf_sequence_number > 0 && ((str[i] & 0xC0) != 0x80)) {
/* Invalid unicode character. replace */
flb_debug("[pack] unexpected UTF-8 continuation byte, "
"substituting character with replacement character");
/* This byte, i, is the start of the next unicode character */
utf_sequence_length = utf_sequence_number;
is_valid = FLB_FALSE;
break;
}

tmp[utf_sequence_number] = str[i];
++i;
}
--i;

if (is_valid) {
if (available - written < utf_sequence_length) {
return FLB_FALSE;
}

encoded_to_buf(p, tmp, utf_sequence_length);
p += utf_sequence_length;
}
else {
if (available - written < utf_sequence_length * 3) {
return FLB_FALSE;
}

/*
* Utf-8 sequence is invalid. Map fragments to private use area
* codepoints in range:
* 0x<FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR>00 to
* 0x<FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR>FF
*/
for (b = 0; b < utf_sequence_length; ++b) {
/*
* Utf-8 private block invalid hex mapping. Format unicode charpoint
* in the following format:
*
* +--------+--------+--------+
* |1110PPPP|10PPPPHH|10HHHHHH|
* +--------+--------+--------+
*
* Where:
* P is FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR bits (1 byte)
* H is Utf-8 fragment hex bits (1 byte)
* 1 is bit 1
* 0 is bit 0
*/

/* unicode codepoint start */
*p = 0xE0;

/* print unicode private block header first 4 bits */
*p |= FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR >> 4;
++p;

/* unicode codepoint middle */
*p = 0x80;

/* print end of unicode private block header last 4 bits */
*p |= ((FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR << 2) & 0x3f);

/* print hex fragment first 2 bits */
*p |= (tmp[b] >> 6) & 0x03;
++p;

/* unicode codepoint middle */
*p = 0x80;

/* print hex fragment last 6 bits */
*p |= tmp[b] & 0x3f;
++p;
}
}
encoded_to_buf(p, tmp, hex_bytes);
p += hex_bytes;
i += (hex_bytes - 1);
}
else {
*p++ = c;
Expand Down
216 changes: 216 additions & 0 deletions tests/internal/utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,13 @@ struct url_check {
char *uri; /* expected uri */
};

/*
 * One test vector for flb_utils_write_str(). Arrays of these are terminated
 * by an entry whose 'input' and 'output' are both NULL.
 */
struct write_str_case {
char *input; /* bytes passed to flb_utils_write_str() as the source string */
int input_len; /* length passed along with 'input' */
char *output; /* expected content of the output buffer */
int ret; /* expected return value (FLB_TRUE or FLB_FALSE) */
};

struct url_check url_checks[] = {
{0, "https://fluentbit.io/something",
"https", "fluentbit.io", "443", "/something"},
Expand Down Expand Up @@ -112,6 +119,54 @@ void test_url_split()
}
}

/* The sized runner is defined further down; declare it so it can be called here */
static void write_str_test_cases_w_buf_size(struct write_str_case *cases, int buf_size);

/*
 * Convenience wrapper: run the given flb_utils_write_str test cases with a
 * 100 byte output buffer.
 */
static void write_str_test_cases(struct write_str_case *cases)
{
    const int default_buf_size = 100;

    write_str_test_cases_w_buf_size(cases, default_buf_size);
}

/*
 * Run every entry of 'cases' through flb_utils_write_str() with the output
 * size limited to 'buf_size' bytes, then check the return value and the
 * produced bytes against the expectations stored in the entry.
 *
 * 'cases' must be terminated by an entry whose 'input' and 'output' are both
 * NULL. The buffer is allocated with one extra guard byte past 'buf_size';
 * that byte is verified to still be zero after each call so that a write past
 * the advertised size is reported.
 */
static void write_str_test_cases_w_buf_size(struct write_str_case *cases, int buf_size) {
    char *buf = flb_calloc(buf_size + 1, sizeof(char));
    int size = buf_size + 1;
    int off;
    int ret;
    size_t expected_len;
    size_t cmp_len;
    struct write_str_case *tcase = cases;

    /* Report an allocation failure instead of dereferencing a NULL buffer */
    if (!TEST_CHECK(buf != NULL)) {
        TEST_MSG("Unable to allocate %d byte output buffer", size);
        return;
    }

    while (!(tcase->input == 0 && tcase->output == 0)) {
        memset(buf, 0, size);
        off = 0;
        ret = flb_utils_write_str(buf, &off, buf_size, tcase->input, tcase->input_len);

        if(!TEST_CHECK(ret == tcase->ret)) {
            TEST_MSG("Input string: %s", tcase->input);
            TEST_MSG("| Expected return value: %s", (tcase->ret == FLB_TRUE) ? "FLB_TRUE"
                                                                             : "FLB_FALSE");
            TEST_MSG("| Produced return value: %s", (ret == FLB_TRUE) ? "FLB_TRUE"
                                                                      : "FLB_FALSE");
        }

        /*
         * Compare the bytes reported through 'off', but never read beyond the
         * expected string and its terminator: if more bytes were produced,
         * the terminator position already differs (or the length check below
         * fails), so nothing past it needs to be inspected.
         */
        expected_len = strlen(tcase->output);
        cmp_len = (size_t) off;
        if (cmp_len > expected_len + 1) {
            cmp_len = expected_len + 1;
        }
        if(!TEST_CHECK(memcmp(buf, tcase->output, cmp_len) == 0)) {
            TEST_MSG("Input string: %s", tcase->input);
            TEST_MSG("| Expected output: %s", tcase->output);
            TEST_MSG("| Produced output: %s", buf);
        }
        if (!TEST_CHECK(strlen(buf) == expected_len)) {
            TEST_MSG("Input string: %s", tcase->input);
            TEST_MSG("| Expected length: %zu", expected_len);
            TEST_MSG("| Produced length: %zu", strlen(buf));
            TEST_MSG("| Expected output: %s", tcase->output);
            TEST_MSG("| Produced output: %s", buf);
        }
        /* The guard byte lives outside the size handed to the function */
        if (!TEST_CHECK(buf[size-1] == 0)) {
            TEST_MSG("Out buffer overwrite detected '%c'", buf[size-1]);
        }

        ++tcase;
    }

    flb_free(buf);
}

void test_write_str()
{
char buf[10];
Expand Down Expand Up @@ -147,6 +202,162 @@ void test_write_str()
TEST_CHECK(ret == FLB_FALSE);
}

/*
 * Sequences that start with the leading byte 0xe3 but are followed by at
 * least one byte (0x01) that is not a valid trailing byte.
 */
void test_write_str_invalid_trailing_bytes()
{
    struct write_str_case table[] = {
        /* One bad trailing byte: e3 and 81 are remapped, 0x01 is escaped */
        {
            .input     = "\xe3\x81\x01""abc",
            .input_len = 6,
            .output    = "\xee\x83\xa3" /* e3 fragment */
                         "\xee\x82\x81" /* 81 fragment */
                         "\\u0001abc",
            .ret       = FLB_TRUE
        },
        /* Two bad trailing bytes: only e3 is remapped, both 0x01 are escaped */
        {
            .input     = "\xe3\x01\x01""abc",
            .input_len = 6,
            .output    = "\xee\x83\xa3" /* e3 fragment */
                         "\\u0001\\u0001abc",
            .ret       = FLB_TRUE
        },
        { 0 } /* terminator */
    };

    write_str_test_cases(table);
}

/*
 * Inputs where an ill-formed sequence is followed by the well-formed
 * character e3 81 82, which must come through untouched.
 */
void test_write_str_invalid_leading_byte()
{
    struct write_str_case table[] = {
        /* Two control bytes are hex-escaped, the valid character is copied */
        {
            .input     = "\x00\x01\xe3\x81\x82""abc",
            .input_len = 8,
            .output    = "\\u0000\\u0001""\xe3\x81\x82""abc",
            .ret       = FLB_TRUE
        },
        /*
         * 0xf3 is a leading byte that expects 3 trailing bytes, but only two
         * follow before 0xe3, itself a leading byte with 2 trailing bytes.
         * The f3 81 81 fragment must not swallow the 0xe3 character.
         */
        {
            .input     = "\xf3\x81\x81\xe3\x81\x82""abc",
            .input_len = 9,
            .output    = "\xee\x83\xb3" /* f3 fragment */
                         "\xee\x82\x81" /* 81 fragment */
                         "\xee\x82\x81" /* 81 fragment */
                         "\xe3\x81\x82""abc", /* valid unicode */
            .ret       = FLB_TRUE
        },
        /*
         * 0xf3 followed by one trailing byte and then the invalid byte 0x01:
         * f3 and 81 are remapped, 0x01 is hex-escaped.
         */
        {
            .input     = "\xf3\x81\x01\xe3\x81\x82""abc",
            .input_len = 9,
            .output    = "\xee\x83\xb3" /* f3 fragment */
                         "\xee\x82\x81" /* 81 fragment */
                         "\\u0001""\xe3\x81\x82""abc",
            .ret       = FLB_TRUE
        },
        { 0 } /* terminator */
    };

    write_str_test_cases(table);
}

/*
 * Inputs containing 0x81 and 0x82 in leading position, where they are not
 * valid leading bytes.
 */
void test_write_str_invalid_leading_byte_case_2()
{
    struct write_str_case table[] = {
        /* Both invalid leading bytes are remapped individually */
        {
            .input     = "\x81\x82""abc",
            .input_len = 5,
            .output    = "\xee\x82\x81" /* 81 fragment */
                         "\xee\x82\x82" /* 82 fragment */
                         "abc",
            .ret       = FLB_TRUE
        },
        /*
         * 0xf3 (leading byte with 3 trailing bytes) is cut short by the
         * invalid byte 0x01, and is then followed by the invalid leading
         * bytes 0x81 and 0x82.
         */
        {
            .input     = "\xf3\x81\x01\x81\x82""abc",
            .input_len = 8,
            .output    = "\xee\x83\xb3" /* f3 fragment */
                         "\xee\x82\x81" /* 81 fragment */
                         "\\u0001"      /* 0x01 hex escape */
                         "\xee\x82\x81" /* 81 fragment */
                         "\xee\x82\x82" /* 82 fragment */
                         "abc",
            .ret       = FLB_TRUE
        },
        { 0 } /* terminator */
    };

    write_str_test_cases(table);
}

/*
 * A lone leading byte at the very end of the input: nothing is expected in
 * the output and the call is still expected to return FLB_TRUE.
 */
void test_write_str_edge_cases()
{
    struct write_str_case table[] = {
        /* 0xf3 with none of its trailing bytes present in the input */
        {
            .input     = "\xf3",
            .input_len = 1,
            .output    = "", /* the truncated character is discarded */
            .ret       = FLB_TRUE
        },
        { 0 } /* terminator */
    };

    write_str_test_cases(table);
}

/*
 * Exercise flb_utils_write_str against a 5 byte output buffer to check what
 * happens when the output fits exactly and when it does not.
 */
void test_write_str_buffer_overrun()
{
    struct write_str_case table[] = {
        /* 2 plain bytes + 3 bytes for the 81 fragment fill the buffer exactly */
        {
            .input     = "aa""\x81",
            .input_len = 3,
            .output    = "aa"
                         "\xee\x82\x81", /* 81 fragment */
            .ret       = FLB_TRUE
        },
        /* 3 plain bytes leave no room for the 3 byte 81 fragment */
        {
            .input     = "aaa""\x81",
            .input_len = 4,
            .output    = "aaa",
            .ret       = FLB_FALSE
        },
        /* The input alone is longer than the buffer; no output is expected */
        {
            .input     = "aaa"
                         "\xe3\x81\x82",
            .input_len = 6,
            .output    = "",
            .ret       = FLB_FALSE
        },
        /* Escaped quote (2 bytes) + valid character (3 bytes) fit exactly */
        {
            .input     = "\""
                         "\xe3\x81\x82",
            .input_len = 4,
            .output    = "\\\"""\xe3\x81\x82",
            .ret       = FLB_TRUE
        },
        /* The 81 fragment fits, the valid character after it does not */
        {
            .input     = "\x81"
                         "\xe3\x81\x82",
            .input_len = 4,
            .output    = "\xee\x82\x81", /* 81 fragment */
            .ret       = FLB_FALSE
        },
        { 0 } /* terminator */
    };

    write_str_test_cases_w_buf_size(table, 5);
}

struct proxy_url_check {
int ret;
char *url; /* full URL */
Expand Down Expand Up @@ -264,6 +475,11 @@ TEST_LIST = {
/* JSON maps iteration */
{ "url_split", test_url_split },
{ "write_str", test_write_str },
{ "test_write_str_invalid_trailing_bytes", test_write_str_invalid_trailing_bytes },
{ "test_write_str_invalid_leading_byte", test_write_str_invalid_leading_byte },
{ "test_write_str_edge_cases", test_write_str_edge_cases },
{ "test_write_str_invalid_leading_byte_case_2", test_write_str_invalid_leading_byte_case_2 },
{ "test_write_str_buffer_overrun", test_write_str_buffer_overrun },
{ "proxy_url_split", test_proxy_url_split },
{ 0 }
};

0 comments on commit 861af37

Please sign in to comment.