Skip to content

Commit

Permalink
FIX: iconv not accepting utf8 codepage on BSD systems
Browse files Browse the repository at this point in the history
  • Loading branch information
Oldes committed Jan 6, 2022
1 parent 5a8c6e0 commit 0c62ea4
Show file tree
Hide file tree
Showing 2 changed files with 138 additions and 101 deletions.
230 changes: 129 additions & 101 deletions src/core/u-iconv.c
Original file line number Diff line number Diff line change
Expand Up @@ -83,21 +83,18 @@ static struct {
{12001, "UCS4BE"},
{12001, "CP12001"},

//#ifndef GLIB_COMPILATION
// /*
// * Default is big endian.
// * See rfc2781 4.3 Interpreting text labelled as UTF-16.
// */
// {1201, "UTF16"},
// {1201, "UTF-16"},
// {1201, "UCS2"},
// {1201, "UCS-2"},
// {12001, "UTF32"},
// {12001, "UTF-32"},
// {12001, "UCS-4"},
// {12001, "UCS4"},
//#else
/* Default is little endian, because the platform is */
#ifdef BIG_ENDIAN
// Big endian...
{1201, "UTF16"},
{1201, "UTF-16"},
{1201, "UCS2"},
{1201, "UCS-2"},
{12001, "UTF32"},
{12001, "UTF-32"},
{12001, "UCS-4"},
{12001, "UCS4"},
#else
// Little endian variants
{1200, "UTF-16"},
{1200, "UTF16"},
{1200, "UCS-2"},
Expand All @@ -106,7 +103,7 @@ static struct {
{12000, "UTF32"},
{12000, "UCS-4"},
{12000, "UCS4"},
//#endif
#endif

/* copy from libiconv `iconv -l` */
/* !IsValidCodePage(367) */
Expand Down Expand Up @@ -231,6 +228,7 @@ static struct {
{936, "WINDOWS-936"},

{950, "CP950"},
{950, "BIG-5"},
{950, "BIG5"},
{950, "BIG5HKSCS"},
{950, "BIG5-HKSCS"},
Expand Down Expand Up @@ -319,31 +317,32 @@ static struct {
/* 709 Arabic (ASMO-449+, BCON V4) */
/* 710 Arabic - Transparent Arabic */
{720, "DOS-720"}, /* Arabic (Transparent ASMO); Arabic (DOS) */
{737, "ibm737"}, /* OEM Greek (formerly 437G); Greek (DOS) */
{775, "ibm775"}, /* OEM Baltic; Baltic (DOS) */
{850, "ibm850"}, /* OEM Multilingual Latin 1; Western European (DOS) */
{852, "ibm852"}, /* OEM Latin 2; Central European (DOS) */
{737, "IBM737"}, /* OEM Greek (formerly 437G); Greek (DOS) */
{775, "IBM775"}, /* OEM Baltic; Baltic (DOS) */
{850, "IBM850"}, /* OEM Multilingual Latin 1; Western European (DOS) */
{852, "IBM852"}, /* OEM Latin 2; Central European (DOS) */
{855, "IBM855"}, /* OEM Cyrillic (primarily Russian) */
{857, "ibm857"}, /* OEM Turkish; Turkish (DOS) */
{857, "IBM857"}, /* OEM Turkish; Turkish (DOS) */
{858, "IBM00858"}, /* OEM Multilingual Latin 1 + Euro symbol */
{860, "IBM860"}, /* OEM Portuguese; Portuguese (DOS) */
{861, "ibm861"}, /* OEM Icelandic; Icelandic (DOS) */
{861, "IBM861"}, /* OEM Icelandic; Icelandic (DOS) */
{862, "DOS-862"}, /* OEM Hebrew; Hebrew (DOS) */
{863, "IBM863"}, /* OEM French Canadian; French Canadian (DOS) */
{864, "IBM864"}, /* OEM Arabic; Arabic (864) */
{865, "IBM865"}, /* OEM Nordic; Nordic (DOS) */
{866, "cp866"}, /* OEM Russian; Cyrillic (DOS) */
{869, "ibm869"}, /* OEM Modern Greek; Greek, Modern (DOS) */
{866, "CP866"}, /* OEM Russian; Cyrillic (DOS) */
{869, "IBM869"}, /* OEM Modern Greek; Greek, Modern (DOS) */
{870, "IBM870"}, /* IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2 */
{874, "windows-874"}, /* ANSI/OEM Thai (same as 28605, ISO 8859-15); Thai (Windows) */
{875, "cp875"}, /* IBM EBCDIC Greek Modern */
{932, "shift_jis"}, /* ANSI/OEM Japanese; Japanese (Shift-JIS) */
{932, "shift-jis"}, /* alternative name for it */
{936, "gb2312"}, /* ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312) */
{949, "ks_c_5601-1987"}, /* ANSI/OEM Korean (Unified Hangul Code) */
{950, "big5"}, /* ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5) */
{950, "big5hkscs"}, /* ANSI/OEM Traditional Chinese (Hong Kong SAR); Chinese Traditional (Big5-HKSCS) */
{950, "big5-hkscs"}, /* alternative name for it */
{874, "WINDOWS-874"}, /* ANSI/OEM Thai (same as 28605, ISO 8859-15); Thai (Windows) */
{875, "CP875"}, /* IBM EBCDIC Greek Modern */
{932, "SHIFT_JIS"}, /* ANSI/OEM Japanese; Japanese (Shift-JIS) */
{932, "SHIFT-JIS"}, /* alternative name for it */
{936, "GB2312"}, /* ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312) */
{949, "KS_C_5601-1987"}, /* ANSI/OEM Korean (Unified Hangul Code) */
{950, "BIG-5"}, /* ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5) */
{950, "BIG5"},
{950, "BIG5-HKSCS"}, /* ANSI/OEM Traditional Chinese (Hong Kong SAR); Chinese Traditional (Big5-HKSCS) */
{950, "BIG5HKSCS"}, /* alternative name for it */
{1026, "IBM1026"}, /* IBM EBCDIC Turkish (Latin 5) */
{1047, "IBM01047"}, /* IBM EBCDIC Latin 1/Open System */
{1140, "IBM01140"}, /* IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro) */
Expand All @@ -365,8 +364,8 @@ static struct {
{1256, "windows-1256"}, /* ANSI Arabic; Arabic (Windows) */
{1257, "windows-1257"}, /* ANSI Baltic; Baltic (Windows) */
{1258, "windows-1258"}, /* ANSI/OEM Vietnamese; Vietnamese (Windows) */
{1361, "Johab"}, /* Korean (Johab) */
{10000, "macintosh"}, /* MAC Roman; Western European (Mac) */
{1361, "JOHAB"}, /* Korean (Johab) */
{10000, "MACINTOSH"}, /* MAC Roman; Western European (Mac) */
{10001, "x-mac-japanese"}, /* Japanese (Mac) */
{10002, "x-mac-chinesetrad"}, /* MAC Traditional Chinese (Big5); Chinese Traditional (Mac) */
{10003, "x-mac-korean"}, /* Korean (Mac) */
Expand All @@ -392,7 +391,7 @@ static struct {
{20106, "x-IA5-German"}, /* IA5 German (7-bit) */
{20107, "x-IA5-Swedish"}, /* IA5 Swedish (7-bit) */
{20108, "x-IA5-Norwegian"}, /* IA5 Norwegian (7-bit) */
{20127, "us-ascii"}, /* US-ASCII (7-bit) */
{20127, "US-ASCII"}, /* US-ASCII (7-bit) */
{20261, "x-cp20261"}, /* T.61 */
{20269, "x-cp20269"}, /* ISO 6937 Non-Spacing Accent */
{20273, "IBM273"}, /* IBM EBCDIC Germany */
Expand All @@ -408,71 +407,71 @@ static struct {
{20424, "IBM424"}, /* IBM EBCDIC Hebrew */
{20833, "x-EBCDIC-KoreanExtended"}, /* IBM EBCDIC Korean Extended */
{20838, "IBM-Thai"}, /* IBM EBCDIC Thai */
{20866, "koi8-r"}, /* Russian (KOI8-R); Cyrillic (KOI8-R) */
{20866, "KOI8-R"}, /* Russian (KOI8-R); Cyrillic (KOI8-R) */
{20871, "IBM871"}, /* IBM EBCDIC Icelandic */
{20880, "IBM880"}, /* IBM EBCDIC Cyrillic Russian */
{20905, "IBM905"}, /* IBM EBCDIC Turkish */
{20924, "IBM00924"}, /* IBM EBCDIC Latin 1/Open System (1047 + Euro symbol) */
{20932, "EUC-JP"}, /* Japanese (JIS 0208-1990 and 0121-1990) */
{20936, "x-cp20936"}, /* Simplified Chinese (GB2312); Chinese Simplified (GB2312-80) */
{20949, "x-cp20949"}, /* Korean Wansung */
{21025, "cp1025"}, /* IBM EBCDIC Cyrillic Serbian-Bulgarian */
{21025, "CP1025"}, /* IBM EBCDIC Cyrillic Serbian-Bulgarian */
/* 21027 (deprecated) */
{21866, "koi8-u"}, /* Ukrainian (KOI8-U); Cyrillic (KOI8-U) */
{28591, "iso-8859-1"}, /* ISO 8859-1 Latin 1; Western European (ISO) */
{28591, "iso8859-1"}, /* ISO 8859-1 Latin 1; Western European (ISO) */
{28591, "iso_8859-1"},
{28591, "iso_8859_1"},
{28592, "iso-8859-2"}, /* ISO 8859-2 Central European; Central European (ISO) */
{28592, "iso8859-2"}, /* ISO 8859-2 Central European; Central European (ISO) */
{28592, "iso_8859-2"},
{28592, "iso_8859_2"},
{28593, "iso-8859-3"}, /* ISO 8859-3 Latin 3 */
{28593, "iso8859-3"}, /* ISO 8859-3 Latin 3 */
{28593, "iso_8859-3"},
{28593, "iso_8859_3"},
{28594, "iso-8859-4"}, /* ISO 8859-4 Baltic */
{28594, "iso8859-4"}, /* ISO 8859-4 Baltic */
{28594, "iso_8859-4"},
{28594, "iso_8859_4"},
{28595, "iso-8859-5"}, /* ISO 8859-5 Cyrillic */
{28595, "iso8859-5"}, /* ISO 8859-5 Cyrillic */
{28595, "iso_8859-5"},
{28595, "iso_8859_5"},
{28596, "iso-8859-6"}, /* ISO 8859-6 Arabic */
{28596, "iso8859-6"}, /* ISO 8859-6 Arabic */
{28596, "iso_8859-6"},
{28596, "iso_8859_6"},
{28597, "iso-8859-7"}, /* ISO 8859-7 Greek */
{28597, "iso8859-7"}, /* ISO 8859-7 Greek */
{28597, "iso_8859-7"},
{28597, "iso_8859_7"},
{28598, "iso-8859-8"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Visual) */
{28598, "iso8859-8"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Visual) */
{28598, "iso_8859-8"},
{28598, "iso_8859_8"},
{28599, "iso-8859-9"}, /* ISO 8859-9 Turkish */
{28599, "iso8859-9"}, /* ISO 8859-9 Turkish */
{28599, "iso_8859-9"},
{28599, "iso_8859_9"},
{28603, "iso-8859-13"}, /* ISO 8859-13 Estonian */
{28603, "iso8859-13"}, /* ISO 8859-13 Estonian */
{28603, "iso_8859-13"},
{28603, "iso_8859_13"},
{28605, "iso-8859-15"}, /* ISO 8859-15 Latin 9 */
{28605, "iso8859-15"}, /* ISO 8859-15 Latin 9 */
{28605, "iso_8859-15"},
{28605, "iso_8859_15"},
{21866, "KOI8-U"}, /* Ukrainian (KOI8-U); Cyrillic (KOI8-U) */
{28591, "ISO-8859-1"}, /* ISO 8859-1 Latin 1; Western European (ISO) */
{28591, "ISO8859-1"}, /* ISO 8859-1 Latin 1; Western European (ISO) */
{28591, "ISO_8859-1"},
{28591, "ISO_8859_1"},
{28592, "ISO-8859-2"}, /* ISO 8859-2 Central European; Central European (ISO) */
{28592, "ISO8859-2"}, /* ISO 8859-2 Central European; Central European (ISO) */
{28592, "ISO_8859-2"},
{28592, "ISO_8859_2"},
{28593, "ISO-8859-3"}, /* ISO 8859-3 Latin 3 */
{28593, "ISO8859-3"}, /* ISO 8859-3 Latin 3 */
{28593, "ISO_8859-3"},
{28593, "ISO_8859_3"},
{28594, "ISO-8859-4"}, /* ISO 8859-4 Baltic */
{28594, "ISO8859-4"}, /* ISO 8859-4 Baltic */
{28594, "ISO_8859-4"},
{28594, "ISO_8859_4"},
{28595, "ISO-8859-5"}, /* ISO 8859-5 Cyrillic */
{28595, "ISO8859-5"}, /* ISO 8859-5 Cyrillic */
{28595, "ISO_8859-5"},
{28595, "ISO_8859_5"},
{28596, "ISO-8859-6"}, /* ISO 8859-6 Arabic */
{28596, "ISO8859-6"}, /* ISO 8859-6 Arabic */
{28596, "ISO_8859-6"},
{28596, "ISO_8859_6"},
{28597, "ISO-8859-7"}, /* ISO 8859-7 Greek */
{28597, "ISO8859-7"}, /* ISO 8859-7 Greek */
{28597, "ISO_8859-7"},
{28597, "ISO_8859_7"},
{28598, "ISO-8859-8"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Visual) */
{28598, "ISO8859-8"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Visual) */
{28598, "ISO_8859-8"},
{28598, "ISO_8859_8"},
{28599, "ISO-8859-9"}, /* ISO 8859-9 Turkish */
{28599, "ISO8859-9"}, /* ISO 8859-9 Turkish */
{28599, "ISO_8859-9"},
{28599, "ISO_8859_9"},
{28603, "ISO-8859-13"}, /* ISO 8859-13 Estonian */
{28603, "ISO8859-13"}, /* ISO 8859-13 Estonian */
{28603, "ISO_8859-13"},
{28603, "ISO_8859_13"},
{28605, "ISO-8859-15"}, /* ISO 8859-15 Latin 9 */
{28605, "ISO8859-15"}, /* ISO 8859-15 Latin 9 */
{28605, "ISO_8859-15"},
{28605, "ISO_8859_15"},
{29001, "x-Europa"}, /* Europa 3 */
{38598, "iso-8859-8-i"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Logical) */
{38598, "iso8859-8-i"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Logical) */
{38598, "iso_8859-8-i"},
{38598, "iso_8859_8-i"},
{50220, "iso-2022-jp"}, /* ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS) */
{38598, "ISO-8859-8-I"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Logical) */
{38598, "ISO8859-8-I"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Logical) */
{38598, "ISO_8859-8-I"},
{38598, "ISO_8859_8-I"},
{50220, "ISO-2022-JP"}, /* ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS) */
{50221, "csISO2022JP"}, /* ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana) */
{50222, "iso-2022-jp"}, /* ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI) */
{50225, "iso-2022-kr"}, /* ISO 2022 Korean */
{50225, "iso2022-kr"}, /* ISO 2022 Korean */
{50222, "ISO-2022-JP"}, /* ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI) */
{50225, "ISO-2022-KR"}, /* ISO 2022 Korean */
{50225, "ISO2022-KR"}, /* ISO 2022 Korean */
{50227, "x-cp50227"}, /* ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022) */
/* 50229 ISO 2022 Traditional Chinese */
/* 50930 EBCDIC Japanese (Katakana) Extended */
Expand All @@ -482,11 +481,11 @@ static struct {
/* 50936 EBCDIC Simplified Chinese */
/* 50937 EBCDIC US-Canada and Traditional Chinese */
/* 50939 EBCDIC Japanese (Latin) Extended and Japanese */
{51932, "euc-jp"}, /* EUC Japanese */
{51932, "EUC-JP"}, /* EUC Japanese */
{51936, "EUC-CN"}, /* EUC Simplified Chinese; Chinese Simplified (EUC) */
{51949, "euc-kr"}, /* EUC Korean */
{51949, "EUC-KR"}, /* EUC Korean */
/* 51950 EUC Traditional Chinese */
{52936, "hz-gb-2312"}, /* HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ) */
{52936, "HZ-GB-2312"}, /* HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ) */
{54936, "GB18030"}, /* Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030) */
{57002, "x-iscii-de"}, /* ISCII Devanagari */
{57003, "x-iscii-be"}, /* ISCII Bengali */
Expand Down Expand Up @@ -523,7 +522,7 @@ static REBINT get_codepage_id(REBVAL *cp)
else if ('0' <= name[0] && name[0] <= '9')
return atoi(cs_cast(name)); /* 123 */

for (i = 0; codepage_alias[i].name != NULL; ++i)
for (i = 0; codepage_alias[i].name != NULL; i++)
if (_stricmp(cs_cast(name), codepage_alias[i].name) == 0)
return codepage_alias[i].codepage;
#else
Expand All @@ -532,9 +531,10 @@ static REBINT get_codepage_id(REBVAL *cp)
else if ('0' <= name[0] && name[0] <= '9')
return atoi(cs_cast(name)); /* 123 */

for (i = 0; codepage_alias[i].name != NULL; ++i)
for (i = 0; codepage_alias[i].name != NULL; i++) {
if (strcasecmp(cs_cast(name), codepage_alias[i].name) == 0)
return codepage_alias[i].codepage;
}
#endif
return -1;
}
Expand All @@ -554,7 +554,7 @@ static REBYTE* get_codepage_name(REBVAL *cp)

//printf("get_codepage_name from: %d\n", id);

for (i = 0; codepage_alias[i].name != NULL; ++i) {
for (i = 0; codepage_alias[i].name != NULL; i++) {
if (codepage_alias[i].codepage == id) {
// printf("found: %s\n", codepage_alias[i].name);
return (REBYTE*)codepage_alias[i].name;
Expand All @@ -577,6 +577,27 @@ static REBYTE* get_codepage_name(REBVAL *cp)
data[i] = (num>>8) | (num<<8);
}
}
#else
/***********************************************************************
**	Some iconv versions accept only certain spellings of a codepage name!
**	For example, on BSD `UTF-8` is accepted, but not `UTF8`.
**	This function converts the input into a numeric ID (as used on Windows)
**	and then back to a name: the first alias listed for that ID in
**	`codepage_alias`, where the preferred (supported) name should be first.
**
**	Returns NULL when the ID cannot be resolved or has no alias.
**
*/ static REBYTE* norm_codepage_name(REBVAL *cp)
/*
***********************************************************************/
{
	REBINT id = get_codepage_id(cp);

	// get_codepage_id could not resolve the input to an ID
	if (id == -1) return NULL;

	// The first matching entry wins, so the table order decides which
	// spelling is handed to iconv.
	for (int i = 0; codepage_alias[i].name != NULL; i++) {
		if (codepage_alias[i].codepage == id)
			return (REBYTE*)codepage_alias[i].name;
	}
	return NULL; // no alias is registered for this ID
}
#endif

/***********************************************************************
Expand Down Expand Up @@ -720,13 +741,20 @@ static REBYTE* get_codepage_name(REBVAL *cp)
size_t dst_size;
size_t nread;
REBCNT tail;
const char *fromcode = cs_cast(get_codepage_name(val_from));

//TODO: currently only strings registered in the aliases are supported,
// but iconv may support more variants, so we could try to convert
// any given codepage name, and do normalization only when that fails!

const char *fromcode = cs_cast(norm_codepage_name(val_from));
const char *tocode;
int wide;

if (!fromcode) Trap1(RE_INVALID_ARG, val_from);
if (!fromcode) {
Trap1(RE_INVALID_ARG, val_from);
}
if (ref_to) {
tocode = cs_cast(get_codepage_name(val_to));
tocode = cs_cast(norm_codepage_name(val_to));
if (!tocode) Trap1(RE_INVALID_ARG, val_to);
wide = 1; // result is raw binary series
} else {
Expand All @@ -737,7 +765,7 @@ static REBYTE* get_codepage_name(REBVAL *cp)
cd = iconv_open(tocode, fromcode);
if (cd == (iconv_t)-1) {
if (get_codepage_id(val_from) < 0) Trap1(RE_INVALID_ARG, val_from);
if (get_codepage_id(val_to) < 0) Trap1(RE_INVALID_ARG, val_to);
if (ref_to && get_codepage_id(val_to) < 0) Trap1(RE_INVALID_ARG, val_to);
Trap0(RE_FEATURE_NA);
}

Expand Down
9 changes: 9 additions & 0 deletions src/tests/units/series-test.r3
Original file line number Diff line number Diff line change
Expand Up @@ -1476,6 +1476,15 @@ Rebol [
;--assert txt = iconv #{FFFE50005901690068006C00E100730069007400} 'UTF16
--assert (next txt) = iconv next #{50F869686CE1736974} 28592

--test-- "ICONV from UTF-8"
;@@ https://github.com/Oldes/Rebol-issues/issues/2475
--assert "š" = iconv #{C5A1} 'utf8
--assert "š" = iconv #{C5A1} 'utf-8
--assert "š" = iconv #{C5A1} 'UTF8
--assert "š" = iconv #{C5A1} 'UTF-8
--assert "š" = iconv #{C5A1} 'CP65001
--assert "š" = iconv #{C5A1} 65001

--test-- "ICONV with empty imput"
--assert "" = iconv #{} 28592
--assert "" = iconv #{} 'utf8
Expand Down

0 comments on commit 0c62ea4

Please sign in to comment.