From 3f3277819613924a26a58c6e4bd33b21eb4bb984 Mon Sep 17 00:00:00 2001 From: Andreas Bolka Date: Tue, 18 Nov 2014 00:18:43 +0100 Subject: [PATCH] Fix encoding detection for READ/string Files with a BOM are now properly handled. Files without a BOM are again decoded using UTF-8. Trying to READ/string a file with an encoding for which support is not yet implemented (UCS4 LE and BE) now causes an error. This fixes zsx/r3#13, regressions introduced by 8336a45 and 70c7c3a, and CureCode issue cc#2186. --- src/core/p-file.c | 2 +- src/core/s-unicode.c | 15 +++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/core/p-file.c b/src/core/p-file.c index 1abb65fe9b..cba02e5fa5 100644 --- a/src/core/p-file.c +++ b/src/core/p-file.c @@ -206,7 +206,7 @@ REBINT Mode_Syms[] = { if (args & (AM_READ_STRING | AM_READ_LINES)) { REBSER *nser = Decode_UTF_String(BIN_HEAD(ser), file->actual, -1); if (nser == NULL) { - nser = ser; + Trap0(RE_BAD_DECODE); } Set_String(ds, nser); if (args & AM_READ_LINES) Set_Block(ds, Split_Lines(ds)); diff --git a/src/core/s-unicode.c b/src/core/s-unicode.c index 128fe8d998..792635d45a 100644 --- a/src/core/s-unicode.c +++ b/src/core/s-unicode.c @@ -922,7 +922,9 @@ ConversionResult ConvertUTF8toUTF32 ( ** Do all the details to decode a string. ** Input is a byte series. Len is len of input. ** The utf is 0, 8, +/-16, +/-32. -** A special -1 means use the BOM. +** A special -1 means use the BOM, if present, or UTF-8 otherwise. +** +** Returns the decoded string or NULL for unsupported encodings. ** ***********************************************************************/ { @@ -931,17 +933,17 @@ ConversionResult ConvertUTF8toUTF32 ( REBINT size; if (utf == -1) { + // Try to detect UTF encoding from a BOM. Returns 0 if no BOM present. utf = What_UTF(bp, len); - if (utf) { + if (utf != 0) { if (utf == 8) bp += 3, len -= 3; else if (utf == -16 || utf == 16) bp += 2, len -= 2; //else if (utf == -32 || utf == 32) bp += 4, len -= 4; else return NULL; - } else { - return NULL; } } - else if (utf == 0 || utf == 8) { + + if (utf == 0 || utf == 8) { size = Decode_UTF8((REBUNI*)Reset_Buffer(ser, len), bp, len, TRUE); } else if (utf == -16 || utf == 16) { @@ -951,7 +953,8 @@ ConversionResult ConvertUTF8toUTF32 ( // size = Decode_UTF32((REBUNI*)Reset_Buffer(ser, len/4 + 1), bp, len, utf < 0, TRUE); // } else { - return NULL; /* should never be here */ + // Encoding is unsupported or not yet implemented. + return NULL; } if (size < 0) {