diff --git a/src/flb_utils.c b/src/flb_utils.c index cefac754fdb..2ccd3a935f8 100644 --- a/src/flb_utils.c +++ b/src/flb_utils.c @@ -50,6 +50,14 @@ extern struct flb_aws_error_reporter *error_reporter; #define FLB_UTILS_REPLACE_FRAGMENT_START_LEN 31 #define FLB_UTILS_REPLACE_FRAGMENT_END_LEN 4 +/* + * The following block descriptor selects the private use unicode character range + * used for denoting invalid unicode fragments. Invalid fragment 0xCE becomes + * unicode codepoint U+E0CE when FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR is set to + * 0xE0, since U+E0CE = (0xE0 << 8) | 0xCE. + */ +#define FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR 0xE0 + static char *flb_utils_replace_fragment_start = "\xEF\xBF\xBD"" corrupted utf-8 sequence : "; static char *flb_utils_replace_fragment_end = " \xEF\xBF\xBD"; @@ -787,22 +795,49 @@ int flb_utils_write_str(char *buf, int *off, size_t size, p += utf_sequence_length; } else { - encoded_to_buf(p, flb_utils_replace_fragment_start, - FLB_UTILS_REPLACE_FRAGMENT_START_LEN); - p += FLB_UTILS_REPLACE_FRAGMENT_START_LEN; - - for (utf_sequence_number = 0; utf_sequence_number < utf_sequence_length; - ++utf_sequence_number) { - if (utf_sequence_number > 0) { - *p++ = ' '; - } - *p++ = int2hex[(tmp[utf_sequence_number] & 0xff) / 0x10]; - *p++ = int2hex[(tmp[utf_sequence_number] & 0xff) % 0x10]; + /* utf sequence is invalid. 
Print fragments out using private block + * codepoint range 0xE000 to 0xE0FF + */ + for (b = 0; b < utf_sequence_length; ++b) { + /* + * Utf-8 private block invalid hex formatting + * Format unicode codepoint in the following format: + * + * +--------+--------+--------+ + * |1110PPPP|10PPPPHH|10HHHHHH| + * +--------+--------+--------+ + * + * Where: + * P is FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR bits (1 byte) + * H is Hex fragment bits (1 byte) + * 1 is bit 1 + * 0 is bit 0 + */ + + /* unicode codepoint start */ + *p = 0xE0; + + /* print unicode private block header first 4 bits */ + *p |= FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR >> 4; + ++p; + + /* unicode codepoint middle */ + *p = 0x80; + + /* print end of unicode private block header last 4 bits */ + *p |= (FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR << 2) & 0x3f; + + /* print hex fragment first 2 bits; the mask drops sign-extension bits if tmp[b] is a negative char */ + *p |= (tmp[b] >> 6) & 0x03; + ++p; + + /* unicode codepoint end */ + *p = 0x80; + + /* print hex fragment last 6 bits */ + *p |= tmp[b] & 0x3f; + ++p; } - - encoded_to_buf(p, flb_utils_replace_fragment_end, - FLB_UTILS_REPLACE_FRAGMENT_END_LEN); - p += FLB_UTILS_REPLACE_FRAGMENT_END_LEN; } } else { diff --git a/tests/internal/utils.c b/tests/internal/utils.c index 32e7223ae4f..a6f7a2a20d1 100644 --- a/tests/internal/utils.c +++ b/tests/internal/utils.c @@ -139,6 +139,13 @@ static void write_str_test_cases(struct write_str_case *cases) { TEST_MSG("| Expected output: %s", tcase->output); TEST_MSG("| Produced output: %s", buf); } + if (!TEST_CHECK(strlen(buf) == strlen(tcase->output))) { + TEST_MSG("Input string: %s", tcase->input); + TEST_MSG("| Expected length: %zu", strlen(tcase->output)); + TEST_MSG("| Produced length: %zu", strlen(buf)); + TEST_MSG("| Expected output: %s", tcase->output); + TEST_MSG("| Produced output: %s", buf); + } ++tcase; } } @@ -184,13 +191,13 @@ void test_write_str_invalid_trailing_bytes() /* Invalid unicode (one bad trailing byte) */ { "\xe3\x81\x01""abc", 6, /* note that 0x01 
is an invalid byte */ - FLB_UTILS_TEST_FRAGMENT_START"e3 81"FLB_UTILS_TEST_FRAGMENT_END"\\u0001abc" /* replace invalid unicode */ + "\xee\x83\xa3""\xee\x82\x81""\\u0001abc" /* e3 and 81 fragments become U+E0E3 and U+E081 */ }, /* Invalid unicode (two bad trailing byte) */ { "\xe3\x01\x01""abc", 6, - FLB_UTILS_TEST_FRAGMENT_START"e3"FLB_UTILS_TEST_FRAGMENT_END"\\u0001\\u0001abc" + "\xee\x83\xa3""\\u0001\\u0001abc" /* e3 fragment becomes U+E0E3 */ }, { 0 } };