Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

flb_utils_write_str: detect and replace ill-formed utf-8 bytes -> master #4346

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 103 additions & 10 deletions src/flb_utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,14 @@ extern struct flb_aws_error_reporter *error_reporter;
#include <openssl/rand.h>
#endif

/*
 * High byte of the Unicode private use codepoints that stand in for invalid
 * UTF-8 bytes (fragments). The invalid byte itself supplies the low byte:
 *
 *   U+<FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR><HEX_FRAGMENT>
 *
 * With the descriptor set to 0xE0, the invalid fragment 0xCE becomes codepoint
 * U+E0CE. Each replaced byte is written to the output as the three byte UTF-8
 * encoding of that codepoint (for example 0x81 -> U+E081 -> 0xEE 0x82 0x81).
 */
#define FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR 0xE0

void flb_utils_error(int err)
{
char *msg = NULL;
Expand Down Expand Up @@ -636,6 +644,9 @@ int flb_utils_write_str(char *buf, int *off, size_t size,
int required;
int len;
int hex_bytes;
int is_valid;
int utf_sequence_number;
int utf_sequence_length;
uint32_t codepoint;
uint32_t state = 0;
char tmp[16];
Expand Down Expand Up @@ -732,20 +743,102 @@ int flb_utils_write_str(char *buf, int *off, size_t size,
i += (hex_bytes - 1);
}
else if (c > 0xFFFF) {
hex_bytes = flb_utf8_len(str + i);
if (available - written < 6) {
return FLB_FALSE;
}
utf_sequence_length = flb_utf8_len(str + i);

if (i + hex_bytes > str_len) {
if (i + utf_sequence_length > str_len) {
break; /* skip truncated UTF-8 */
}
for (b = 0; b < hex_bytes; b++) {
tmp[b] = str[i+b];

is_valid = FLB_TRUE;
for (utf_sequence_number = 0; utf_sequence_number < utf_sequence_length;
utf_sequence_number++) {
/* Leading characters must start with bits 11 */
if (utf_sequence_number == 0 && ((str[i] & 0xC0) != 0xC0)) {
/* Invalid unicode character. replace */
flb_debug("[pack] unexpected UTF-8 leading byte, "
"substituting character with replacement character");
tmp[utf_sequence_number] = str[i];
++i; /* Consume invalid leading byte */
utf_sequence_length = utf_sequence_number + 1;
is_valid = FLB_FALSE;
break;
}
/* Trailing characters must start with bits 10 */
else if (utf_sequence_number > 0 && ((str[i] & 0xC0) != 0x80)) {
/* Invalid unicode character. replace */
flb_debug("[pack] unexpected UTF-8 continuation byte, "
"substituting character with replacement character");
/* This byte, i, is the start of the next unicode character */
utf_sequence_length = utf_sequence_number;
is_valid = FLB_FALSE;
break;
}

tmp[utf_sequence_number] = str[i];
++i;
}
--i;

if (is_valid) {
if (available - written < utf_sequence_length) {
return FLB_FALSE;
}

encoded_to_buf(p, tmp, utf_sequence_length);
p += utf_sequence_length;
}
else {
if (available - written < utf_sequence_length * 3) {
return FLB_FALSE;
}

/*
* Utf-8 sequence is invalid. Map fragments to private use area
* codepoints in range:
* 0x<FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR>00 to
* 0x<FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR>FF
*/
for (b = 0; b < utf_sequence_length; ++b) {
/*
* Utf-8 private block invalid hex mapping. Format unicode charpoint
* in the following format:
*
* +--------+--------+--------+
* |1110PPPP|10PPPPHH|10HHHHHH|
* +--------+--------+--------+
*
* Where:
* P is FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR bits (1 byte)
* H is Utf-8 fragment hex bits (1 byte)
* 1 is bit 1
* 0 is bit 0
*/

/* unicode codepoint start */
*p = 0xE0;

/* print unicode private block header first 4 bits */
*p |= FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR >> 4;
++p;

/* unicode codepoint middle */
*p = 0x80;

/* print end of unicode private block header last 4 bits */
*p |= ((FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR << 2) & 0x3f);

/* print hex fragment first 2 bits */
*p |= (tmp[b] >> 6) & 0x03;
++p;

/* unicode codepoint middle */
*p = 0x80;

/* print hex fragment last 6 bits */
*p |= tmp[b] & 0x3f;
++p;
}
}
encoded_to_buf(p, tmp, hex_bytes);
p += hex_bytes;
i += (hex_bytes - 1);
}
else {
*p++ = c;
Expand Down
216 changes: 216 additions & 0 deletions tests/internal/utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,13 @@ struct url_check {
char *uri; /* expected uri */
};

/* One flb_utils_write_str() test case: an input and the result expected for it */
struct write_str_case {
char *input; /* bytes handed to flb_utils_write_str() */
int input_len; /* number of bytes of 'input' to process */
char *output; /* expected contents of the output buffer */
int ret; /* expected return value (FLB_TRUE or FLB_FALSE) */
};

struct url_check url_checks[] = {
{0, "https://fluentbit.io/something",
"https", "fluentbit.io", "443", "/something"},
Expand Down Expand Up @@ -112,6 +119,54 @@ void test_url_split()
}
}

/* Sized test case runner for flb_utils_write_str; declared here, defined below */
static void write_str_test_cases_w_buf_size(struct write_str_case *cases, int buf_size);

/*
 * Run a table of flb_utils_write_str test cases using an output buffer of
 * 100 bytes.
 */
static void write_str_test_cases(struct write_str_case *cases)
{
    const int default_buf_size = 100;

    write_str_test_cases_w_buf_size(cases, default_buf_size);
}

/*
 * Test case loop for flb_utils_write_str.
 *
 * Every entry of 'cases' is passed to flb_utils_write_str() together with a
 * zeroed output buffer, and the return value, the produced bytes and the
 * produced length are compared against the expectations stored in the entry.
 *
 * cases:    table of test cases; the table ends at the first entry whose
 *           'input' and 'output' are both NULL.
 * buf_size: output size reported to flb_utils_write_str(). One extra guard
 *           byte is allocated behind it; it must still be zero after each
 *           call, otherwise an out of bounds write is reported.
 */
static void write_str_test_cases_w_buf_size(struct write_str_case *cases, int buf_size) {
    struct write_str_case *tcase = cases;
    int size = buf_size + 1;
    char *buf;
    int off;
    int ret;

    buf = flb_calloc(size, sizeof(char));
    if (!TEST_CHECK(buf != NULL)) {
        /* Without a buffer no case can run; report it instead of crashing */
        TEST_MSG("Could not allocate %d bytes for the output buffer", size);
        return;
    }

    while (!(tcase->input == 0 && tcase->output == 0)) {
        memset(buf, 0, size);
        off = 0;
        ret = flb_utils_write_str(buf, &off, buf_size, tcase->input, tcase->input_len);

        if(!TEST_CHECK(ret == tcase->ret)) {
            TEST_MSG("Input string: %s", tcase->input);
            TEST_MSG("| Expected return value: %s", (tcase->ret == FLB_TRUE) ? "FLB_TRUE"
                                                                             : "FLB_FALSE");
            TEST_MSG("| Produced return value: %s", (ret == FLB_TRUE) ? "FLB_TRUE"
                                                                      : "FLB_FALSE");
        }
        /*
         * Compare the 'off' bytes that were produced. strncmp() stops at the
         * terminator of the expected string, so an output that is longer than
         * expected is reported as a mismatch without reading past the end of
         * tcase->output.
         */
        if(!TEST_CHECK(strncmp(buf, tcase->output, (size_t) off) == 0)) {
            TEST_MSG("Input string: %s", tcase->input);
            TEST_MSG("| Expected output: %s", tcase->output);
            TEST_MSG("| Produced output: %s", buf);
        }
        if (!TEST_CHECK(strlen(buf) == strlen(tcase->output))) {
            TEST_MSG("Input string: %s", tcase->input);
            TEST_MSG("| Expected length: %zu", strlen(tcase->output));
            TEST_MSG("| Produced length: %zu", strlen(buf));
            TEST_MSG("| Expected output: %s", tcase->output);
            TEST_MSG("| Produced output: %s", buf);
        }
        /* The guard byte sits right after the buf_size bytes handed out */
        if (!TEST_CHECK(buf[size-1] == 0)) {
            TEST_MSG("Out buffer overwrite detected '%c'", buf[size-1]);
        }

        ++tcase;
    }

    flb_free(buf);
}

void test_write_str()
{
char buf[10];
Expand Down Expand Up @@ -147,6 +202,162 @@ void test_write_str()
TEST_CHECK(ret == FLB_FALSE);
}

/*
 * Multi-byte lead byte (0xe3) whose sequence is interrupted by a byte that is
 * not a continuation byte. The bytes consumed so far are expected to be
 * replaced by private use codepoints, while the interrupting 0x01 is expected
 * to be escaped as \u0001.
 */
void test_write_str_invalid_trailing_bytes()
{
    struct write_str_case cases[] = {
        /* One bad trailing byte: 0xe3 0x81 is followed by 0x01 */
        {
            .input     = "\xe3\x81\x01""abc",
            .input_len = 6,
            .output    = "\xee\x83\xa3" /* replacement for fragment e3 */
                         "\xee\x82\x81" /* replacement for fragment 81 */
                         "\\u0001abc",
            .ret       = FLB_TRUE
        },
        /* Two bad trailing bytes: 0xe3 is directly followed by 0x01 0x01 */
        {
            .input     = "\xe3\x01\x01""abc",
            .input_len = 6,
            .output    = "\xee\x83\xa3" /* replacement for fragment e3 */
                         "\\u0001\\u0001abc",
            .ret       = FLB_TRUE
        },
        { 0 }
    };

    write_str_test_cases(cases);
}

/*
 * Sequences that start with a lead byte announcing more continuation bytes
 * than actually follow, placed in front of the valid sequence 0xe3 0x81 0x82.
 * The valid sequence is expected to reach the output untouched.
 */
void test_write_str_invalid_leading_byte()
{
    struct write_str_case cases[] = {
        /* Two control bytes that get hex escaped, then one valid sequence */
        {
            .input     = "\x00\x01\xe3\x81\x82""abc",
            .input_len = 8,
            .output    = "\\u0000\\u0001""\xe3\x81\x82""abc",
            .ret       = FLB_TRUE
        },
        /*
         * 0xf3 is a lead byte with 3 trailing bytes, but only two of them
         * (0x81 0x81) follow. The next byte, 0xe3, is itself a lead byte with
         * 2 trailing bytes and must not be swallowed by the invalid 0xf3
         * fragment.
         */
        {
            .input     = "\xf3\x81\x81\xe3\x81\x82""abc",
            .input_len = 9,
            .output    = "\xee\x83\xb3" /* replacement for fragment f3 */
                         "\xee\x82\x81" /* replacement for fragment 81 */
                         "\xee\x82\x81" /* replacement for fragment 81 */
                         "\xe3\x81\x82""abc", /* valid sequence, kept as is */
            .ret       = FLB_TRUE
        },
        /*
         * 0xf3 is a lead byte with 3 trailing bytes; the sequence is broken
         * by 0x01, which is not a continuation byte.
         */
        {
            .input     = "\xf3\x81\x01\xe3\x81\x82""abc",
            .input_len = 9,
            .output    = "\xee\x83\xb3" /* replacement for fragment f3 */
                         "\xee\x82\x81" /* replacement for fragment 81 */
                         "\\u0001""\xe3\x81\x82""abc",
            .ret       = FLB_TRUE
        },
        { 0 }
    };

    write_str_test_cases(cases);
}

/*
 * Continuation bytes (0x81, 0x82) showing up where a lead byte is required.
 * Each of them is expected to be replaced by its private use codepoint.
 */
void test_write_str_invalid_leading_byte_case_2()
{
    struct write_str_case cases[] = {
        /* 0x81 and 0x82 are not valid lead bytes */
        {
            .input     = "\x81\x82""abc",
            .input_len = 5,
            .output    = "\xee\x82\x81" /* replacement for fragment 81 */
                         "\xee\x82\x82" /* replacement for fragment 82 */
                         "abc",
            .ret       = FLB_TRUE
        },
        /*
         * 0xf3 is a lead byte with 3 trailing bytes, its sequence is broken
         * by 0x01, and the stray bytes 0x81 0x82 that follow are not valid
         * lead bytes either.
         */
        {
            .input     = "\xf3\x81\x01\x81\x82""abc",
            .input_len = 8,
            .output    = "\xee\x83\xb3" /* replacement for fragment f3 */
                         "\xee\x82\x81" /* replacement for fragment 81 */
                         "\\u0001"      /* hex escape of 0x01 */
                         "\xee\x82\x81" /* replacement for fragment 81 */
                         "\xee\x82\x82" /* replacement for fragment 82 */
                         "abc",
            .ret       = FLB_TRUE
        },
        { 0 }
    };

    write_str_test_cases(cases);
}

/*
 * Input that ends right after a multi-byte lead byte. Nothing is expected in
 * the output and the call is still expected to succeed; the case also guards
 * against reading past the end of the input.
 */
void test_write_str_edge_cases()
{
    struct write_str_case cases[] = {
        /* Lone lead byte 0xf3 with none of its trailing bytes present */
        {
            .input     = "\xf3",
            .input_len = 1,
            .output    = "", /* the truncated sequence is discarded */
            .ret       = FLB_TRUE
        },
        { 0 }
    };

    write_str_test_cases(cases);
}

/*
 * Cases run against an output buffer of only 5 bytes, so that the expanded
 * output either fits exactly or would run past the end of the buffer.
 */
void test_write_str_buffer_overrun()
{
    struct write_str_case cases[] = {
        /* 2 plain bytes + 3 replacement bytes: fills the buffer exactly */
        {
            .input     = "aa""\x81",
            .input_len = 3,
            .output    = "aa"
                         "\xee\x82\x81", /* replacement for fragment 81 */
            .ret       = FLB_TRUE
        },
        /*
         * 3 plain bytes leave no room for the 3 byte replacement of 0x81;
         * only "aaa" is expected in the buffer.
         */
        {
            .input     = "aaa""\x81",
            .input_len = 4,
            .output    = "aaa",
            .ret       = FLB_FALSE
        },
        /* The input alone (6 bytes) is already larger than the buffer */
        {
            .input     = "aaa"
                         "\xe3\x81\x82",
            .input_len = 6,
            .output    = "",
            .ret       = FLB_FALSE
        },
        /* Escaped quote (2 bytes) + valid sequence (3 bytes): exact fit */
        {
            .input     = "\""
                         "\xe3\x81\x82",
            .input_len = 4,
            .output    = "\\\"""\xe3\x81\x82",
            .ret       = FLB_TRUE
        },
        /*
         * The replacement of 0x81 takes 3 bytes, which leaves no room for
         * the valid sequence 0xe3 0x81 0x82 that follows.
         */
        {
            .input     = "\x81"
                         "\xe3\x81\x82",
            .input_len = 4,
            .output    = "\xee\x82\x81", /* replacement for fragment 81 */
            .ret       = FLB_FALSE
        },
        { 0 }
    };

    write_str_test_cases_w_buf_size(cases, 5);
}

struct proxy_url_check {
int ret;
char *url; /* full URL */
Expand Down Expand Up @@ -264,6 +475,11 @@ TEST_LIST = {
/* JSON maps iteration */
{ "url_split", test_url_split },
{ "write_str", test_write_str },
{ "test_write_str_invalid_trailing_bytes", test_write_str_invalid_trailing_bytes },
{ "test_write_str_invalid_leading_byte", test_write_str_invalid_leading_byte },
{ "test_write_str_edge_cases", test_write_str_edge_cases },
{ "test_write_str_invalid_leading_byte_case_2", test_write_str_invalid_leading_byte_case_2 },
{ "test_write_str_buffer_overrun", test_write_str_buffer_overrun },
{ "proxy_url_split", test_proxy_url_split },
{ 0 }
};