Skip to content

Commit

Permalink
Fix loop when wrapping line with ISO-8859-1 character
Browse files Browse the repository at this point in the history
Changes utf8_char_length(), utf8_to_unicode() and utf8_length()
implementation to rely on utf8proc.

Fixes jonas#1087
  • Loading branch information
koutcher committed Feb 27, 2021
1 parent 9fb0a2f commit bfbbb20
Showing 1 changed file with 17 additions and 72 deletions.
89 changes: 17 additions & 72 deletions src/string.c
Original file line number Diff line number Diff line change
Expand Up @@ -214,72 +214,24 @@ unicode_width(unsigned long c, int tab_size)

/* Number of bytes used for encoding a UTF-8 character indexed by first byte.
* Illegal bytes are set to one. */
static const unsigned char utf8_bytes[256] = {
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
};

unsigned char
utf8_char_length(const char *string)
{
int c = *(unsigned char *) string;
size_t c = *(unsigned char *) string;

return utf8_bytes[c];
return utf8proc_utf8class[c] ? utf8proc_utf8class[c] : 1;
}

/* Decode UTF-8 multi-byte representation into a Unicode character. */
unsigned long
utf8_to_unicode(const char *string, size_t length)
{
unsigned long unicode;

switch (length) {
case 1:
unicode = string[0];
break;
case 2:
unicode = (string[0] & 0x1f) << 6;
unicode += (string[1] & 0x3f);
break;
case 3:
unicode = (string[0] & 0x0f) << 12;
unicode += ((string[1] & 0x3f) << 6);
unicode += (string[2] & 0x3f);
break;
case 4:
unicode = (string[0] & 0x0f) << 18;
unicode += ((string[1] & 0x3f) << 12);
unicode += ((string[2] & 0x3f) << 6);
unicode += (string[3] & 0x3f);
break;
case 5:
unicode = (string[0] & 0x0f) << 24;
unicode += ((string[1] & 0x3f) << 18);
unicode += ((string[2] & 0x3f) << 12);
unicode += ((string[3] & 0x3f) << 6);
unicode += (string[4] & 0x3f);
break;
case 6:
unicode = (string[0] & 0x01) << 30;
unicode += ((string[1] & 0x3f) << 24);
unicode += ((string[2] & 0x3f) << 18);
unicode += ((string[3] & 0x3f) << 12);
unicode += ((string[4] & 0x3f) << 6);
unicode += (string[5] & 0x3f);
break;
default:
return 0;
}
utf8proc_int32_t unicode;
utf8proc_ssize_t slen = utf8proc_iterate((const utf8proc_uint8_t *) string, length, &unicode);

/* Invalid characters could return the special 0xfffd value but NUL
* should be just as good. */
return unicode > 0x10FFFF ? 0 : unicode;
return slen <= 0 || unicode < 0 ? 0 : unicode;
}

/* Calculates how much of string can be shown within the given maximum width
Expand All @@ -293,30 +245,23 @@ utf8_length(const char **start, int max_chars, size_t skip, int *width, size_t m
{
const char *string = *start;
const char *end = max_chars < 0 ? strchr(string, '\0') : string + max_chars;
unsigned char last_bytes = 0;
size_t last_ucwidth = 0;
utf8proc_ssize_t last_bytes = 0;
int last_ucwidth = 0;

*width = 0;
*trimmed = 0;

while (string < end) {
unsigned char bytes = utf8_char_length(string);
size_t ucwidth;
unsigned long unicode;

if (string + bytes > end)
break;

/* Change representation to figure out whether
* it is a single- or double-width character. */

unicode = utf8_to_unicode(string, bytes);
/* FIXME: Graceful handling of invalid Unicode character. */
if (!unicode)
break;

ucwidth = unicode == '\t' ? tab_size - (*width % tab_size) :
utf8proc_charwidth((utf8proc_int32_t) unicode);
utf8proc_int32_t unicode;
utf8proc_ssize_t bytes = utf8proc_iterate((const utf8proc_uint8_t *) string, end - string, &unicode);
int ucwidth;

/* Assume a width of 1 for invalid UTF-8 encoding (could be ISO-8859-1). */
if (bytes <= 0 || unicode < 0)
ucwidth = bytes = 1;
else
ucwidth = unicode == '\t' ? tab_size - (*width % tab_size) :
utf8proc_charwidth(unicode);
if (skip > 0) {
skip -= ucwidth <= skip ? ucwidth : skip;
*start += bytes;
Expand Down

0 comments on commit bfbbb20

Please sign in to comment.